# Source repository: yichael/AutoAndroidController

"""
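Text recognition and localisation module.
Purpose: find a given piece of text in a screenshot and return its coordinates within the screenshot.
Text recognition is performed with OnnxOCR.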
"""

import sys
import os
import cv2
from pathlib import Path
from typing import Optional, Tuple, Dict, Any

# Put the "OnnxOCR" directory that sits next to this file at the front of
# sys.path so the `onnxocr` package imported below can be resolved from it.
current_dir = Path(__file__).parent
onnxocr_path = current_dir / 'OnnxOCR'
if str(onnxocr_path) not in sys.path:
    sys.path.insert(0, str(onnxocr_path))

from onnxocr.onnx_paddleocr import ONNXPaddleOcr

# Set an environment variable meant to skip the model-source connectivity check
# (per the original author's note: avoids a timeout on the first run).
# NOTE(review): this is assigned after the onnxocr import above — confirm the
# flag is read lazily rather than at import time.
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'


def find_text_location(
    screenshot_path: str,
    target_text: str,
    device_width: Optional[int] = None,
    device_height: Optional[int] = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> Optional[Dict[str, Any]]:
    """
    Find the first occurrence of the target text in a screenshot and return its location.

    A recognised line matches when the target text is contained in it or when
    it is contained in the target text (partial matches in both directions).
    Recognised lines that are empty or whitespace-only are ignored, because an
    empty string is a substring of every target and would always "match".

    Args:
        screenshot_path: Path of the screenshot file.
        target_text: Text to look for; must contain a non-whitespace character.
        device_width: Real device width in pixels. Coordinates are rescaled to
            the device resolution only when both device_width and
            device_height are given.
        device_height: Real device height in pixels (see device_width).
        use_angle_cls: Whether to enable the angle classifier; forwarded to
            both the ONNXPaddleOcr constructor and the ocr() call.
        lang: Language hint. Accepted for interface compatibility; this
            function does not currently pass it to the OCR engine.

    Returns:
        On success, a dict of the form
        {
            "found": True,
            "x": centre x,
            "y": centre y,
            "width": box width,
            "height": box height,
            "bbox": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]],  # four corner points
            "text": the recognised text that matched,
            "confidence": recognition confidence as a float
        }
        with all coordinates already scaled (when a device resolution was
        supplied). If nothing matches, returns {"found": False}.

    Raises:
        FileNotFoundError: The screenshot file does not exist.
        ValueError: target_text is empty or whitespace-only.
        RuntimeError: Any failure while reading the image or running OCR; the
            original exception is attached as __cause__.
    """
    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        raise FileNotFoundError(f"截图文件不存在: {screenshot_path}")

    if not target_text or not target_text.strip():
        raise ValueError("目标文字不能为空")

    try:
        # Load the image ourselves: we need its size for scaling, and the OCR
        # call below is given the decoded image object rather than a path.
        img = cv2.imread(str(screenshot))
        if img is None:
            raise ValueError(f"无法读取截图文件: {screenshot_path}")

        screenshot_height, screenshot_width = img.shape[:2]

        # Scale factors from screenshot pixels to device pixels (identity
        # unless both device dimensions were supplied).
        scale_x = 1.0
        scale_y = 1.0
        if device_width is not None and device_height is not None:
            scale_x = device_width / screenshot_width
            scale_y = device_height / screenshot_height

        ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
        result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)

        if not result or not result[0]:
            return {"found": False}

        # Each entry of result[0] is read as
        # [[[x1,y1], [x2,y2], [x3,y3], [x4,y4]], (text, confidence)].
        for line in result[0]:
            if not line:
                continue

            bbox = line[0]
            text_info = line[1]
            recognized_text = text_info[0]
            confidence = text_info[1]

            # Skip blank recognitions: "" (or " ") is a substring of almost
            # any target and would otherwise produce a bogus hit.
            if not recognized_text or not recognized_text.strip():
                continue

            if target_text in recognized_text or recognized_text in target_text:
                # Scale the corner points once, then derive every other value
                # from them so that x/y/width/height always agree with "bbox"
                # (no truncation before scaling).
                scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
                x_coords = [point[0] for point in scaled_bbox]
                y_coords = [point[1] for point in scaled_bbox]

                min_x, max_x = min(x_coords), max(x_coords)
                min_y, max_y = min(y_coords), max(y_coords)

                return {
                    "found": True,
                    "x": int((min_x + max_x) / 2),
                    "y": int((min_y + max_y) / 2),
                    "width": max_x - min_x,
                    "height": max_y - min_y,
                    "bbox": scaled_bbox,
                    "text": recognized_text,
                    "confidence": float(confidence)
                }

        return {"found": False}

    except Exception as e:
        # Keep the RuntimeError contract for callers, but chain the cause so
        # the underlying failure stays visible in tracebacks.
        raise RuntimeError(f"OCR 识别过程中出错: {str(e)}") from e


def find_text_location_multiple(
    screenshot_path: str,
    target_text: str,
    device_width: Optional[int] = None,
    device_height: Optional[int] = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> list:
    """
    Find every occurrence of the target text in a screenshot.

    A recognised line matches when the target text is contained in it or when
    it is contained in the target text. Recognised lines that are empty or
    whitespace-only are ignored, because an empty string is a substring of
    every target and would always "match".

    Args:
        screenshot_path: Path of the screenshot file.
        target_text: Text to look for; must contain a non-whitespace character.
        device_width: Real device width in pixels. Coordinates are rescaled to
            the device resolution only when both device_width and
            device_height are given.
        device_height: Real device height in pixels (see device_width).
        use_angle_cls: Whether to enable the angle classifier; forwarded to
            both the ONNXPaddleOcr constructor and the ocr() call.
        lang: Language hint. Accepted for interface compatibility; this
            function does not currently pass it to the OCR engine.

    Returns:
        A list with one dict per matching line, in the order the OCR engine
        reported them. Each dict has the keys "found" (always True), "x", "y",
        "width", "height", "bbox", "text" and "confidence". The list is empty
        when nothing matches.

    Raises:
        FileNotFoundError: The screenshot file does not exist.
        ValueError: target_text is empty or whitespace-only.
        RuntimeError: Any failure while reading the image or running OCR; the
            original exception is attached as __cause__.
    """
    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        raise FileNotFoundError(f"截图文件不存在: {screenshot_path}")

    if not target_text or not target_text.strip():
        raise ValueError("目标文字不能为空")

    try:
        # Load the image ourselves: we need its size for scaling, and the OCR
        # call below is given the decoded image object rather than a path.
        img = cv2.imread(str(screenshot))
        if img is None:
            raise ValueError(f"无法读取截图文件: {screenshot_path}")

        screenshot_height, screenshot_width = img.shape[:2]

        # Scale factors from screenshot pixels to device pixels (identity
        # unless both device dimensions were supplied).
        scale_x = 1.0
        scale_y = 1.0
        if device_width is not None and device_height is not None:
            scale_x = device_width / screenshot_width
            scale_y = device_height / screenshot_height

        ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
        result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)

        if not result or not result[0]:
            return []

        matches = []
        # Each entry of result[0] is read as
        # [[[x1,y1], [x2,y2], [x3,y3], [x4,y4]], (text, confidence)].
        for line in result[0]:
            if not line:
                continue

            bbox = line[0]
            text_info = line[1]
            recognized_text = text_info[0]
            confidence = text_info[1]

            # Skip blank recognitions: "" (or " ") is a substring of almost
            # any target and would otherwise produce a bogus hit.
            if not recognized_text or not recognized_text.strip():
                continue

            if target_text in recognized_text or recognized_text in target_text:
                # Scale the corner points once, then derive every other value
                # from them so that x/y/width/height always agree with "bbox"
                # (no truncation before scaling).
                scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
                x_coords = [point[0] for point in scaled_bbox]
                y_coords = [point[1] for point in scaled_bbox]

                min_x, max_x = min(x_coords), max(x_coords)
                min_y, max_y = min(y_coords), max(y_coords)

                matches.append({
                    "found": True,
                    "x": int((min_x + max_x) / 2),
                    "y": int((min_y + max_y) / 2),
                    "width": max_x - min_x,
                    "height": max_y - min_y,
                    "bbox": scaled_bbox,
                    "text": recognized_text,
                    "confidence": float(confidence)
                })

        return matches

    except Exception as e:
        # Keep the RuntimeError contract for callers, but chain the cause so
        # the underlying failure stays visible in tracebacks.
        raise RuntimeError(f"OCR 识别过程中出错: {str(e)}") from e


if __name__ == "__main__":
    # Command-line smoke test: look up one piece of text in a screenshot and
    # print where it was found.
    if len(sys.argv) < 3:
        for usage_line in (
            "用法: python string-reg-location.py <截图路径> <要查找的文字> [设备宽度] [设备高度]",
            "示例: python string-reg-location.py screenshot.png \"你好\" 1080 2400",
        ):
            print(usage_line)
        sys.exit(1)

    screenshot_path, target_text = sys.argv[1], sys.argv[2]

    # The device resolution is optional and only parsed when both the width
    # and the height arguments are present.
    device_width = device_height = None
    if len(sys.argv) >= 5:
        try:
            device_width = int(sys.argv[3])
            device_height = int(sys.argv[4])
        except ValueError:
            print("警告: 无法解析设备分辨率，将使用截图原始尺寸")

    try:
        result = find_text_location(screenshot_path, target_text, device_width, device_height)
        if not result.get("found"):
            print("未找到匹配的文字")
        else:
            x, y = result["x"], result["y"]
            w, h = result["width"], result["height"]
            print(f"找到文字！坐标: x={x}, y={y}, 宽度={w}, 高度={h}")
            print(f"识别的文字: {result.get('text', '')}")
            print(f"置信度: {result.get('confidence', 0):.2f}")
            print(f"JSON格式: {{\"x\": {x}, \"y\": {y}, \"width\": {w}, \"height\": {h}}}")
    except Exception as e:
        # Any failure is reported on stdout and signalled with exit status 1.
        print(f"错误: {e}")
        sys.exit(1)