yichael
/
AutoAndroidController


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
							"""
鏂囧瓧璇嗗埆鍜屽畾浣嶆ā鍧?鍔熻兘锛氬湪鎴浘涓煡鎵炬寚瀹氭枃瀛楋紝骞惰繑鍥炴枃瀛楀湪鎴浘涓殑鍧愭爣
浣跨敤 OnnxOCR 杩涜鏂囧瓧璇嗗埆
"""

import sys
import os
import cv2
from pathlib import Path
from typing import Optional, Tuple, Dict, Any

# Put the OnnxOCR directory that sits next to this file at the front of
# sys.path (only once), so the `onnxocr` package can be imported.
current_dir = Path(__file__).parent
onnxocr_path = current_dir / 'OnnxOCR'
if str(onnxocr_path) not in sys.path:
    sys.path.insert(0, str(onnxocr_path))

from onnxocr.onnx_paddleocr import ONNXPaddleOcr

# 璁剧疆鐜鍙橀噺锛岃烦杩囨ā鍨嬫簮杩炴帴妫€鏌ワ紙閬垮厤棣栨杩愯鏃惰秴鏃讹級
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'


def find_text_location(
    screenshot_path: str,
    target_text: str,
    device_width: int = None,
    device_height: int = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> Optional[Dict[str, Any]]:
    """
    鍦ㄦ埅鍥句腑鏌ユ壘鐩爣鏂囧瓧骞惰繑鍥炲潗鏍?    
    Args:
        screenshot_path: 鎴浘鏂囦欢璺緞
        target_text: 瑕佹煡鎵剧殑鏂囧瓧
        device_width: 璁惧瀹為檯瀹藉害锛堝儚绱狅級锛屽鏋滄彁渚涘垯浼氬皢鍧愭爣杞崲鍒拌澶囧垎杈ㄧ巼
        device_height: 璁惧瀹為檯楂樺害锛堝儚绱狅級锛屽鏋滄彁渚涘垯浼氬皢鍧愭爣杞崲鍒拌澶囧垎杈ㄧ巼
        use_angle_cls: 鏄惁浣跨敤瑙掑害鍒嗙被鍣紝榛樿True (瀵瑰簲 OnnxOCR 鐨?use_angle_cls)
        lang: 璇█绫诲瀷锛?ch'琛ㄧず涓嫳鏂囨贩鍚堬紝'en'琛ㄧず鑻辨枃锛岄粯璁?ch' (OnnxOCR 鍐呴儴澶勭悊)
    
    Returns:
        濡傛灉鎵惧埌鏂囧瓧锛岃繑鍥炲寘鍚潗鏍囦俊鎭殑瀛楀吀锛?        {
            "found": True,
            "x": 涓績鐐箈鍧愭爣,
            "y": 涓績鐐箉鍧愭爣,
            "width": 鏂囧瓧妗嗗搴?
            "height": 鏂囧瓧妗嗛珮搴?
            "bbox": [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]  # 鏂囧瓧妗嗙殑鍥涗釜瑙掔偣
        }
        濡傛灉鏈壘鍒帮紝杩斿洖 {"found": False}
    """
    # 妫€鏌ユ枃浠舵槸鍚﹀瓨鍦?    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        raise FileNotFoundError(f"鎴浘鏂囦欢涓嶅瓨鍦? {screenshot_path}")
    
    # 妫€鏌ョ洰鏍囨枃瀛楁槸鍚︿负绌?    if not target_text or not target_text.strip():
        raise ValueError("鐩爣鏂囧瓧涓嶈兘涓虹┖")
    
    try:
        # 璇诲彇鎴浘浠ヨ幏鍙栧疄闄呭昂瀵?        img = cv2.imread(str(screenshot))
        if img is None:
            raise ValueError(f"鏃犳硶璇诲彇鎴浘鏂囦欢: {screenshot_path}")
        
        screenshot_height, screenshot_width = img.shape[:2]
        
        # 璁＄畻缂╂斁姣斾緥锛堝鏋滄彁渚涗簡璁惧鍒嗚鲸鐜囷級
        scale_x = 1.0
        scale_y = 1.0
        if device_width is not None and device_height is not None:
            scale_x = device_width / screenshot_width
            scale_y = device_height / screenshot_height
        
        # 鍒濆鍖?OnnxOCR锛堥娆¤皟鐢ㄥ彲鑳介渶瑕佷竴浜涙椂闂达級
        # OnnxOCR 闇€瑕佷紶鍏ュ浘鐗囧璞★紝鑰屼笉鏄矾寰?        ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
        
        # 璇嗗埆鎴浘涓殑鎵€鏈夋枃瀛楋紙OnnxOCR 闇€瑕佷紶鍏?cv2 璇诲彇鐨勫浘鐗囧璞★級
        result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)
        
        # 濡傛灉璇嗗埆缁撴灉涓虹┖
        if not result or not result[0]:
            return {"found": False}
        
        # 鍦ㄨ瘑鍒粨鏋滀腑鏌ユ壘鐩爣鏂囧瓧
        # result[0] 鏄竴涓垪琛紝姣忎釜鍏冪礌鏄竴琛屾枃瀛楃殑璇嗗埆缁撴灉
        # 鏍煎紡: [[[x1,y1], [x2,y2], [x3,y3], [x4,y4]], (鏂囧瓧鍐呭, 缃俊搴?]
        for line in result[0]:
            if not line:
                continue
            
            # line[0] 鏄洓涓鐐瑰潗鏍?            # line[1] 鏄?(鏂囧瓧鍐呭, 缃俊搴?
            bbox = line[0]  # 鍥涗釜瑙掔偣: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text_info = line[1]
            recognized_text = text_info[0]  # 璇嗗埆鐨勬枃瀛楀唴瀹?            confidence = text_info[1]  # 缃俊搴?            
            # 妫€鏌ヨ瘑鍒殑鏂囧瓧鏄惁鍖呭惈鐩爣鏂囧瓧锛堟敮鎸侀儴鍒嗗尮閰嶅拰瀹屽叏鍖归厤锛?            # 浣跨敤 in 鎿嶄綔绗︽敮鎸侀儴鍒嗗尮閰嶏紝濡傛灉闇€瑕佸畬鍏ㄥ尮閰嶅彲浠ヤ娇鐢?==
            if target_text in recognized_text or recognized_text in target_text:
                # 鎵惧埌鍖归厤锛岃绠楁枃瀛楁鐨勪腑蹇冪偣鍜屽昂瀵?                # bbox 鏄洓涓鐐圭殑鍒楄〃: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
                x_coords = [point[0] for point in bbox]
                y_coords = [point[1] for point in bbox]
                
                # 璁＄畻杈圭晫妗嗭紙鍩轰簬鎴浘灏哄锛?                min_x = int(min(x_coords))
                max_x = int(max(x_coords))
                min_y = int(min(y_coords))
                max_y = int(max(y_coords))
                
                # 灏嗗潗鏍囪浆鎹㈠埌璁惧鍒嗚鲸鐜囷紙濡傛灉鎻愪緵浜嗚澶囧垎杈ㄧ巼锛?                min_x = int(min_x * scale_x)
                max_x = int(max_x * scale_x)
                min_y = int(min_y * scale_y)
                max_y = int(max_y * scale_y)
                
                # 杞崲 bbox 鍧愭爣
                scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
                
                # 璁＄畻涓績鐐癸紙鍩轰簬璁惧鍒嗚鲸鐜囷級
                center_x = int((min_x + max_x) / 2)
                center_y = int((min_y + max_y) / 2)
                
                # 璁＄畻瀹藉害鍜岄珮搴︼紙鍩轰簬璁惧鍒嗚鲸鐜囷級
                width = max_x - min_x
                height = max_y - min_y
                
                return {
                    "found": True,
                    "x": center_x,
                    "y": center_y,
                    "width": width,
                    "height": height,
                    "bbox": scaled_bbox,  # 宸茶浆鎹㈠埌璁惧鍒嗚鲸鐜囩殑鍧愭爣
                    "text": recognized_text,  # 瀹為檯璇嗗埆鐨勬枃瀛?                    "confidence": float(confidence)  # 缃俊搴?                }
        
        # 鏈壘鍒板尮閰嶇殑鏂囧瓧
        return {"found": False}
        
    except Exception as e:
        raise RuntimeError(f"OCR 璇嗗埆杩囩▼涓嚭閿? {str(e)}")


def find_text_location_multiple(
    screenshot_path: str,
    target_text: str,
    device_width: int = None,
    device_height: int = None,
    use_angle_cls: bool = True,
    lang: str = 'ch'
) -> list:
    """
    鍦ㄦ埅鍥句腑鏌ユ壘鐩爣鏂囧瓧鐨勬墍鏈夊嚭鐜颁綅缃紙鍙兘鏈夊澶勫尮閰嶏級
    
    Args:
        screenshot_path: 鎴浘鏂囦欢璺緞
        target_text: 瑕佹煡鎵剧殑鏂囧瓧
        device_width: 璁惧瀹為檯瀹藉害锛堝儚绱狅級
        device_height: 璁惧瀹為檯楂樺害锛堝儚绱狅級
        use_angle_cls: 鏄惁浣跨敤瑙掑害鍒嗙被鍣?        lang: 璇█绫诲瀷
    
    Returns:
        杩斿洖鎵€鏈夊尮閰嶄綅缃殑鍒楄〃锛屾瘡涓厓绱犱负鍖呭惈鍧愭爣淇℃伅鐨勫瓧鍏?    """
    screenshot = Path(screenshot_path)
    if not screenshot.exists():
        raise FileNotFoundError(f"鎴浘鏂囦欢涓嶅瓨鍦? {screenshot_path}")
    
    if not target_text or not target_text.strip():
        raise ValueError("鐩爣鏂囧瓧涓嶈兘涓虹┖")
    
    try:
        # 璇诲彇鎴浘浠ヨ幏鍙栧疄闄呭昂瀵?        img = cv2.imread(str(screenshot))
        if img is None:
            raise ValueError(f"鏃犳硶璇诲彇鎴浘鏂囦欢: {screenshot_path}")
        
        screenshot_height, screenshot_width = img.shape[:2]
        
        # 璁＄畻缂╂斁姣斾緥锛堝鏋滄彁渚涗簡璁惧鍒嗚鲸鐜囷級
        scale_x = 1.0
        scale_y = 1.0
        if device_width is not None and device_height is not None:
            scale_x = device_width / screenshot_width
            scale_y = device_height / screenshot_height
        
        # 鍒濆鍖?OnnxOCR
        ocr = ONNXPaddleOcr(use_angle_cls=use_angle_cls, use_gpu=False)
        # OnnxOCR 闇€瑕佷紶鍏ュ浘鐗囧璞?        result = ocr.ocr(img, det=True, rec=True, cls=use_angle_cls)
        
        if not result or not result[0]:
            return []
        
        matches = []
        for line in result[0]:
            if not line:
                continue
            
            bbox = line[0]
            text_info = line[1]
            recognized_text = text_info[0]
            confidence = text_info[1]
            
            # 妫€鏌ユ槸鍚﹀尮閰?            if target_text in recognized_text or recognized_text in target_text:
                x_coords = [point[0] for point in bbox]
                y_coords = [point[1] for point in bbox]
                
                # 璁＄畻杈圭晫妗嗭紙鍩轰簬鎴浘灏哄锛?                min_x = int(min(x_coords))
                max_x = int(max(x_coords))
                min_y = int(min(y_coords))
                max_y = int(max(y_coords))
                
                # 灏嗗潗鏍囪浆鎹㈠埌璁惧鍒嗚鲸鐜囷紙濡傛灉鎻愪緵浜嗚澶囧垎杈ㄧ巼锛?                min_x = int(min_x * scale_x)
                max_x = int(max_x * scale_x)
                min_y = int(min_y * scale_y)
                max_y = int(max_y * scale_y)
                
                # 杞崲 bbox 鍧愭爣
                scaled_bbox = [[int(p[0] * scale_x), int(p[1] * scale_y)] for p in bbox]
                
                center_x = int((min_x + max_x) / 2)
                center_y = int((min_y + max_y) / 2)
                width = max_x - min_x
                height = max_y - min_y
                
                matches.append({
                    "found": True,
                    "x": center_x,
                    "y": center_y,
                    "width": width,
                    "height": height,
                    "bbox": scaled_bbox,
                    "text": recognized_text,
                    "confidence": float(confidence)
                })
        
        return matches
        
    except Exception as e:
        raise RuntimeError(f"OCR 璇嗗埆杩囩▼涓嚭閿? {str(e)}")


if __name__ == "__main__":
    # 娴嬭瘯绀轰緥
    if len(sys.argv) < 3:
        print("鐢ㄦ硶: python string-reg-location.py <鎴浘璺緞> <瑕佹煡鎵剧殑鏂囧瓧> [璁惧瀹藉害] [璁惧楂樺害]")
        print("绀轰緥: python string-reg-location.py screenshot.png \"浣犲ソ\" 1080 2400")
        sys.exit(1)
    
    screenshot_path = sys.argv[1]
    target_text = sys.argv[2]
    
    device_width = None
    device_height = None
    if len(sys.argv) >= 5:
        try:
            device_width = int(sys.argv[3])
            device_height = int(sys.argv[4])
        except ValueError:
            print("璀﹀憡: 鏃犳硶瑙ｆ瀽璁惧鍒嗚鲸鐜囷紝灏嗕娇鐢ㄦ埅鍥惧師濮嬪昂瀵?)
    
    try:
        result = find_text_location(screenshot_path, target_text, device_width, device_height)
        if result.get("found"):
            x = result["x"]
            y = result["y"]
            w = result["width"]
            h = result["height"]
            print(f"鎵惧埌鏂囧瓧锛佸潗鏍? x={x}, y={y}, 瀹藉害={w}, 楂樺害={h}")
            print(f"璇嗗埆鐨勬枃瀛? {result.get('text', '')}")
            print(f"缃俊搴? {result.get('confidence', 0):.2f}")
            print(f"JSON鏍煎紡: {{\"x\": {x}, \"y\": {y}, \"width\": {w}, \"height\": {h}}}")
        else:
            print("鏈壘鍒板尮閰嶇殑鏂囧瓧")
    except Exception as e:
        print(f"閿欒: {e}")
        sys.exit(1)