| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292 |
- """
- 文字识别和定位模块
- 功能:在截图中查找指定文字,并返回文字在截图中的坐标
- 使用 OnnxOCR 进行文字识别
- """
- import sys
- import os
- import cv2
- from pathlib import Path
- from typing import Optional, Tuple, Dict, Any
# Make the 'OnnxOCR' directory that sits next to this file importable by
# putting it at the front of sys.path (only if it is not already listed).
current_dir = Path(__file__).parent
onnxocr_path = current_dir / 'OnnxOCR'
if str(onnxocr_path) not in sys.path:
    sys.path.insert(0, str(onnxocr_path))
# This import relies on the sys.path entry added above, so it must stay below it.
from onnxocr.onnx_paddleocr import ONNXPaddleOcr
# Set an environment flag intended to skip the model-source connectivity check
# (to avoid a timeout on the first run).
# NOTE(review): the flag is assigned after the onnxocr import above; if it is
# read at import time it has no effect here — confirm where it is consumed.
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
# Shared OCR engines, keyed by whether the angle classifier is enabled.
# Creating an engine can be slow, so each configuration is built once per
# process and reused by later calls instead of being rebuilt on every lookup.
_OCR_ENGINES: Dict[bool, Any] = {}


def _get_ocr_engine(use_angle_cls: bool) -> Any:
    """Return the cached CPU OCR engine for this angle-classifier setting."""
    key = bool(use_angle_cls)
    engine = _OCR_ENGINES.get(key)
    if engine is None:
        engine = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
        _OCR_ENGINES[key] = engine
    return engine


def _validate_inputs(screenshot_path: str, target_text: str) -> Path:
    """Check that the screenshot exists and the target text is not blank."""
    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        raise FileNotFoundError(f"截图文件不存在: {screenshot_path}")
    if not target_text or not target_text.strip():
        raise ValueError("目标文字不能为空")
    return screenshot


def _build_match(
    bbox: Any,
    recognized_text: str,
    confidence: Any,
    scale_x: float,
    scale_y: float,
) -> Dict[str, Any]:
    """Turn one OCR hit into the result dictionary returned to callers."""
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]

    # Extremes are truncated to whole screenshot pixels first and only then
    # scaled, which keeps the numbers identical to earlier versions.
    min_x = int(int(min(xs)) * scale_x)
    max_x = int(int(max(xs)) * scale_x)
    min_y = int(int(min(ys)) * scale_y)
    max_y = int(int(max(ys)) * scale_y)

    return {
        "found": True,
        "x": int((min_x + max_x) / 2),
        "y": int((min_y + max_y) / 2),
        "width": max_x - min_x,
        "height": max_y - min_y,
        "bbox": [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox],
        "text": recognized_text,
        "confidence": float(confidence),
    }


def _locate_matches(
    screenshot_path: str,
    target_text: str,
    device_width: Optional[int],
    device_height: Optional[int],
    use_angle_cls: bool,
    stop_at_first: bool,
) -> list:
    """Run OCR on the screenshot and collect the lines matching target_text.

    Shared implementation behind both public entry points.  When
    ``stop_at_first`` is true, scanning stops at the first matching line.
    """
    screenshot = _validate_inputs(screenshot_path, target_text)

    try:
        # The engine is given the decoded image, not the path.
        img = cv2.imread(str(screenshot))
        if img is None:
            raise ValueError(f"无法读取截图文件: {screenshot_path}")

        screenshot_height, screenshot_width = img.shape[:2]

        # Scaling is applied only when BOTH device dimensions are supplied;
        # otherwise coordinates stay in screenshot pixels.
        scale_x = 1.0
        scale_y = 1.0
        if device_width is not None and device_height is not None:
            scale_x = device_width / screenshot_width
            scale_y = device_height / screenshot_height

        ocr = _get_ocr_engine(use_angle_cls)
        result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)
        if not result or not result[0]:
            return []

        # Each entry of result[0] is indexed as
        # [four corner points, (recognised text, confidence)].
        matches = []
        for line in result[0]:
            if not line:
                continue

            bbox = line[0]
            text_info = line[1]
            recognized_text = text_info[0]
            confidence = text_info[1]

            # A blank recognised string is a substring of every target, so it
            # would otherwise be reported as a (bogus) match.
            if not recognized_text or not recognized_text.strip():
                continue

            # Substring match in either direction (partial matches count).
            if target_text in recognized_text or recognized_text in target_text:
                matches.append(
                    _build_match(bbox, recognized_text, confidence, scale_x, scale_y)
                )
                if stop_at_first:
                    break

        return matches

    except Exception as e:
        # Callers see one RuntimeError for any failure in the OCR stage; the
        # original exception is kept as __cause__ for debugging.
        raise RuntimeError(f"OCR 识别过程中出错: {str(e)}") from e


def find_text_location(
    screenshot_path: str,
    target_text: str,
    device_width: Optional[int] = None,
    device_height: Optional[int] = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> Optional[Dict[str, Any]]:
    """
    Find the first occurrence of a piece of text in a screenshot.

    Args:
        screenshot_path: Path of the screenshot file.
        target_text: Text to look for. An OCR line matches when it contains
            the target or is itself contained in the target.
        device_width: Real device width in pixels. Coordinates are scaled to
            the device resolution only when both device_width and
            device_height are given.
        device_height: Real device height in pixels (see device_width).
        use_angle_cls: Whether to enable the angle classifier (passed to the
            OCR engine).
        lang: Accepted for interface compatibility; this function does not
            pass it to the OCR engine.

    Returns:
        On success a dictionary:
            {
                "found": True,
                "x": centre x, "y": centre y,
                "width": box width, "height": box height,
                "bbox": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]],
                "text": the recognised text of the matching line,
                "confidence": recognition confidence as a float
            }
        Otherwise {"found": False}.

    Raises:
        FileNotFoundError: The screenshot path does not exist.
        ValueError: target_text is empty or whitespace only.
        RuntimeError: Anything failed while reading the image or running OCR
            (the underlying exception is attached as the cause).
    """
    matches = _locate_matches(
        screenshot_path,
        target_text,
        device_width,
        device_height,
        use_angle_cls,
        stop_at_first=True,
    )
    if matches:
        return matches[0]
    return {"found": False}


def find_text_location_multiple(
    screenshot_path: str,
    target_text: str,
    device_width: Optional[int] = None,
    device_height: Optional[int] = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> list:
    """
    Find every occurrence of a piece of text in a screenshot.

    Args:
        screenshot_path: Path of the screenshot file.
        target_text: Text to look for (substring match in either direction).
        device_width: Real device width in pixels; used together with
            device_height to scale coordinates to the device resolution.
        device_height: Real device height in pixels.
        use_angle_cls: Whether to enable the angle classifier.
        lang: Accepted for interface compatibility; this function does not
            pass it to the OCR engine.

    Returns:
        A list with one dictionary per matching OCR line, in the order the
        engine reported them. Each dictionary has the same keys as the
        success result of find_text_location. The list is empty when nothing
        matches.

    Raises:
        FileNotFoundError: The screenshot path does not exist.
        ValueError: target_text is empty or whitespace only.
        RuntimeError: Anything failed while reading the image or running OCR
            (the underlying exception is attached as the cause).
    """
    return _locate_matches(
        screenshot_path,
        target_text,
        device_width,
        device_height,
        use_angle_cls,
        stop_at_first=False,
    )
if __name__ == "__main__":
    # Command-line smoke test: look up one piece of text in a screenshot and
    # print where it was found.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("用法: python string-reg-location.py <截图路径> <要查找的文字> [设备宽度] [设备高度]")
        print("示例: python string-reg-location.py screenshot.png \"你好\" 1080 2400")
        sys.exit(1)

    screenshot_path, target_text = cli_args[0], cli_args[1]

    # The device resolution is optional and only read when both width and
    # height are present on the command line.
    device_width = None
    device_height = None
    if len(cli_args) >= 4:
        try:
            device_width = int(cli_args[2])
            device_height = int(cli_args[3])
        except ValueError:
            print("警告: 无法解析设备分辨率,将使用截图原始尺寸")

    try:
        result = find_text_location(screenshot_path, target_text, device_width, device_height)
        if not result.get("found"):
            print("未找到匹配的文字")
        else:
            x, y, w, h = (result[key] for key in ("x", "y", "width", "height"))
            print(f"找到文字!坐标: x={x}, y={y}, 宽度={w}, 高度={h}")
            print(f"识别的文字: {result.get('text', '')}")
            print(f"置信度: {result.get('confidence', 0):.2f}")
            print(f"JSON格式: {{\"x\": {x}, \"y\": {y}, \"width\": {w}, \"height\": {h}}}")
    except Exception as e:
        # Any failure is reported on stdout and turned into exit status 1.
        print(f"错误: {e}")
        sys.exit(1)
|