# string-reg-location.py
  1. """
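Text recognition and localisation module.
Purpose: find a given piece of text in a screenshot and return its coordinates within the screenshot.
Uses OnnxOCR for text recognition.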
  5. """
  6. import sys
  7. import os
  8. import cv2
  9. from pathlib import Path
  10. from typing import Optional, Tuple, Dict, Any
  11. # 添加 OnnxOCR 路径到 sys.path
  12. current_dir = Path(__file__).parent
  13. onnxocr_path = current_dir / 'OnnxOCR'
  14. if str(onnxocr_path) not in sys.path:
  15. sys.path.insert(0, str(onnxocr_path))
  16. from onnxocr.onnx_paddleocr import ONNXPaddleOcr
  17. # 设置环境变量,跳过模型源连接检查(避免首次运行时超时)
  18. os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
  19. def find_text_location(
  20. screenshot_path: str,
  21. target_text: str,
  22. device_width: int = None,
  23. device_height: int = None,
  24. use_angle_cls: bool = True,
  25. lang: str = 'ch'
  26. ) -> Optional[Dict[str, Any]]:
  27. """
  28. 在截图中查找目标文字并返回坐标
  29. Args:
  30. screenshot_path: 截图文件路径
  31. target_text: 要查找的文字
  32. device_width: 设备实际宽度(像素),如果提供则会将坐标转换到设备分辨率
  33. device_height: 设备实际高度(像素),如果提供则会将坐标转换到设备分辨率
  34. use_angle_cls: 是否使用角度分类器,默认True (对应 OnnxOCR 的 use_angle_cls)
  35. lang: 语言类型,'ch'表示中英文混合,'en'表示英文,默认'ch' (OnnxOCR 内部处理)
  36. Returns:
  37. 如果找到文字,返回包含坐标信息的字典:
  38. {
  39. "found": True,
  40. "x": 中心点x坐标,
  41. "y": 中心点y坐标,
  42. "width": 文字框宽度,
  43. "height": 文字框高度,
  44. "bbox": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # 文字框的四个角点
  45. }
  46. 如果未找到,返回 {"found": False}
  47. """
  48. # 检查文件是否存在
  49. screenshot = Path(screenshot_path)
  50. if not screenshot.exists():
  51. raise FileNotFoundError(f"截图文件不存在: {screenshot_path}")
  52. # 检查目标文字是否为空
  53. if not target_text or not target_text.strip():
  54. raise ValueError("目标文字不能为空")
  55. try:
  56. # 读取截图以获取实际尺寸
  57. img = cv2.imread(str(screenshot))
  58. if img is None:
  59. raise ValueError(f"无法读取截图文件: {screenshot_path}")
  60. screenshot_height, screenshot_width = img.shape[:2]
  61. # 计算缩放比例(如果提供了设备分辨率)
  62. scale_x = 1.0
  63. scale_y = 1.0
  64. if device_width is not None and device_height is not None:
  65. scale_x = device_width / screenshot_width
  66. scale_y = device_height / screenshot_height
  67. # 初始化 OnnxOCR(首次调用可能需要一些时间)
  68. # OnnxOCR 需要传入图片对象,而不是路径
  69. ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
  70. # 识别截图中的所有文字(OnnxOCR 需要传入 cv2 读取的图片对象)
  71. result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)
  72. # 如果识别结果为空
  73. if not result or not result[0]:
  74. return {"found": False}
  75. # 在识别结果中查找目标文字
  76. # result[0] 是一个列表,每个元素是一行文字的识别结果
  77. # 格式: [[[x1,y1], [x2,y2], [x3,y3], [x4,y4]], (文字内容, 置信度)]
  78. for line in result[0]:
  79. if not line:
  80. continue
  81. # line[0] 是四个角点坐标
  82. # line[1] 是 (文字内容, 置信度)
  83. bbox = line[0] # 四个角点: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
  84. text_info = line[1]
  85. recognized_text = text_info[0] # 识别的文字内容
  86. confidence = text_info[1] # 置信度
  87. # 检查识别的文字是否包含目标文字(支持部分匹配和完全匹配)
  88. # 使用 in 操作符支持部分匹配,如果需要完全匹配可以使用 ==
  89. if target_text in recognized_text or recognized_text in target_text:
  90. # 找到匹配,计算文字框的中心点和尺寸
  91. # bbox 是四个角点的列表: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
  92. x_coords = [point[0] for point in bbox]
  93. y_coords = [point[1] for point in bbox]
  94. # 计算边界框(基于截图尺寸)
  95. min_x = int(min(x_coords))
  96. max_x = int(max(x_coords))
  97. min_y = int(min(y_coords))
  98. max_y = int(max(y_coords))
  99. # 将坐标转换到设备分辨率(如果提供了设备分辨率)
  100. min_x = int(min_x * scale_x)
  101. max_x = int(max_x * scale_x)
  102. min_y = int(min_y * scale_y)
  103. max_y = int(max_y * scale_y)
  104. # 转换 bbox 坐标
  105. scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
  106. # 计算中心点(基于设备分辨率)
  107. center_x = int((min_x + max_x) / 2)
  108. center_y = int((min_y + max_y) / 2)
  109. # 计算宽度和高度(基于设备分辨率)
  110. width = max_x - min_x
  111. height = max_y - min_y
  112. return {
  113. "found": True,
  114. "x": center_x,
  115. "y": center_y,
  116. "width": width,
  117. "height": height,
  118. "bbox": scaled_bbox, # 已转换到设备分辨率的坐标
  119. "text": recognized_text, # 实际识别的文字
  120. "confidence": float(confidence) # 置信度
  121. }
  122. # 未找到匹配的文字
  123. return {"found": False}
  124. except Exception as e:
  125. raise RuntimeError(f"OCR 识别过程中出错: {str(e)}")
  126. def find_text_location_multiple(
  127. screenshot_path: str,
  128. target_text: str,
  129. device_width: int = None,
  130. device_height: int = None,
  131. use_angle_cls: bool = True,
  132. lang: str = 'ch'
  133. ) -> list:
  134. """
  135. 在截图中查找目标文字的所有出现位置(可能有多处匹配)
  136. Args:
  137. screenshot_path: 截图文件路径
  138. target_text: 要查找的文字
  139. device_width: 设备实际宽度(像素)
  140. device_height: 设备实际高度(像素)
  141. use_angle_cls: 是否使用角度分类器
  142. lang: 语言类型
  143. Returns:
  144. 返回所有匹配位置的列表,每个元素为包含坐标信息的字典
  145. """
  146. screenshot = Path(screenshot_path)
  147. if not screenshot.exists():
  148. raise FileNotFoundError(f"截图文件不存在: {screenshot_path}")
  149. if not target_text or not target_text.strip():
  150. raise ValueError("目标文字不能为空")
  151. try:
  152. # 读取截图以获取实际尺寸
  153. img = cv2.imread(str(screenshot))
  154. if img is None:
  155. raise ValueError(f"无法读取截图文件: {screenshot_path}")
  156. screenshot_height, screenshot_width = img.shape[:2]
  157. # 计算缩放比例(如果提供了设备分辨率)
  158. scale_x = 1.0
  159. scale_y = 1.0
  160. if device_width is not None and device_height is not None:
  161. scale_x = device_width / screenshot_width
  162. scale_y = device_height / screenshot_height
  163. # 初始化 OnnxOCR
  164. ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
  165. # OnnxOCR 需要传入图片对象
  166. result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)
  167. if not result or not result[0]:
  168. return []
  169. matches = []
  170. for line in result[0]:
  171. if not line:
  172. continue
  173. bbox = line[0]
  174. text_info = line[1]
  175. recognized_text = text_info[0]
  176. confidence = text_info[1]
  177. # 检查是否匹配
  178. if target_text in recognized_text or recognized_text in target_text:
  179. x_coords = [point[0] for point in bbox]
  180. y_coords = [point[1] for point in bbox]
  181. # 计算边界框(基于截图尺寸)
  182. min_x = int(min(x_coords))
  183. max_x = int(max(x_coords))
  184. min_y = int(min(y_coords))
  185. max_y = int(max(y_coords))
  186. # 将坐标转换到设备分辨率(如果提供了设备分辨率)
  187. min_x = int(min_x * scale_x)
  188. max_x = int(max_x * scale_x)
  189. min_y = int(min_y * scale_y)
  190. max_y = int(max_y * scale_y)
  191. # 转换 bbox 坐标
  192. scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
  193. center_x = int((min_x + max_x) / 2)
  194. center_y = int((min_y + max_y) / 2)
  195. width = max_x - min_x
  196. height = max_y - min_y
  197. matches.append({
  198. "found": True,
  199. "x": center_x,
  200. "y": center_y,
  201. "width": width,
  202. "height": height,
  203. "bbox": scaled_bbox,
  204. "text": recognized_text,
  205. "confidence": float(confidence)
  206. })
  207. return matches
  208. except Exception as e:
  209. raise RuntimeError(f"OCR 识别过程中出错: {str(e)}")
  210. if __name__ == "__main__":
  211. # 测试示例
  212. if len(sys.argv) < 3:
  213. print("用法: python string-reg-location.py <截图路径> <要查找的文字> [设备宽度] [设备高度]")
  214. print("示例: python string-reg-location.py screenshot.png \"你好\" 1080 2400")
  215. sys.exit(1)
  216. screenshot_path = sys.argv[1]
  217. target_text = sys.argv[2]
  218. device_width = None
  219. device_height = None
  220. if len(sys.argv) >= 5:
  221. try:
  222. device_width = int(sys.argv[3])
  223. device_height = int(sys.argv[4])
  224. except ValueError:
  225. print("警告: 无法解析设备分辨率,将使用截图原始尺寸")
  226. try:
  227. result = find_text_location(screenshot_path, target_text, device_width, device_height)
  228. if result.get("found"):
  229. x = result["x"]
  230. y = result["y"]
  231. w = result["width"]
  232. h = result["height"]
  233. print(f"找到文字!坐标: x={x}, y={y}, 宽度={w}, 高度={h}")
  234. print(f"识别的文字: {result.get('text', '')}")
  235. print(f"置信度: {result.get('confidence', 0):.2f}")
  236. print(f"JSON格式: {{\"x\": {x}, \"y\": {y}, \"width\": {w}, \"height\": {h}}}")
  237. else:
  238. print("未找到匹配的文字")
  239. except Exception as e:
  240. print(f"错误: {e}")
  241. sys.exit(1)