import argparse
import json
import os
import sys
import time
from pathlib import Path

import cv2
import numpy as np

# Force UTF-8 on the standard streams so the non-ASCII log messages below
# are emitted with a consistent encoding.
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Put the bundled OnnxOCR checkout on sys.path; the script cannot work
# without it, so a missing directory terminates the process.
project_root = Path(__file__).parent.parent.parent
onnxocr_path = project_root / 'python' / 'OnnxOCR-main'
if onnxocr_path.exists():
    sys.path.insert(0, str(onnxocr_path))
    print(f"[INFO] 使用本地OnnxOCR路径: {onnxocr_path}")
else:
    print(f"[ERROR] 未找到本地OnnxOCR路径: {onnxocr_path}")
    sys.exit(1)

try:
    from onnxocr.onnx_paddleocr import ONNXPaddleOcr
    ONNXOCR_AVAILABLE = True
except ImportError as e:
    print(f"[ERROR] 无法导入OnnxOCR模块: {e}")
    ONNXOCR_AVAILABLE = False
    sys.exit(1)


# Human-readable description of each supported mode, stored in the JSON report.
_MODE_DESCRIPTIONS = {
    "full": "完整OCR:检测+识别+角度分类",
    "detect": "仅检测:只检测区域坐标,不识别文字",
    "fast": "快速检测:直接使用检测器,最快速度",
}


def _collect_dialogues(ocr_engine, img, mode):
    """Run the OCR engine in the requested mode and return the region records.

    Each record is a dict with the keys ``bbox``, ``text``, ``confidence``
    and ``mode``. Raises ``ValueError`` for an unknown mode.
    """
    if mode == "full":
        # Full pipeline: detection + recognition + angle classification.
        print("[INFO] 执行完整OCR识别(检测+识别+角度分类)...")
        ocr_result = ocr_engine.ocr(img, det=True, rec=True, cls=True)
        if not (ocr_result and ocr_result[0]):
            return []
        return [
            {
                "bbox": bbox,
                "text": text,
                "confidence": float(confidence),
                "mode": "full_ocr",
            }
            for bbox, (text, confidence) in ocr_result[0]
        ]

    if mode == "detect":
        # Detection only: region coordinates without recognised text.
        print("[INFO] 执行文字区域检测(仅坐标,不识别文字)...")
        detection_result = ocr_engine.ocr(img, det=True, rec=False, cls=False)
        if not (detection_result and detection_result[0]):
            return []
        return [
            {
                "bbox": bbox,
                "text": f"[区域{i+1}]",  # placeholder text
                "confidence": 1.0,
                "mode": "detection_only",
            }
            for i, bbox in enumerate(detection_result[0])
        ]

    if mode == "fast":
        # Fast detection: call the engine's text detector directly.
        print("[INFO] 执行快速文字检测(直接检测器)...")
        dt_boxes = ocr_engine.text_detector(img)
        if dt_boxes is None or len(dt_boxes) == 0:
            return []
        return [
            {
                "bbox": bbox.tolist(),  # numpy array -> list so it is JSON serialisable
                "text": f"[快速检测{i+1}]",  # placeholder text
                "confidence": 1.0,
                "mode": "fast_detection",
            }
            for i, bbox in enumerate(dt_boxes)
        ]

    raise ValueError(f"不支持的模式: {mode}")


def _print_preview(dialogues, mode, limit=5):
    """Print a short preview of at most ``limit`` detected regions."""
    print("[INFO] 识别结果预览:")
    for i, d in enumerate(dialogues[:limit]):
        if mode == "full":
            print(f" {i+1}. '{d['text']}' (置信度: {d['confidence']:.3f})")
        else:
            bbox = d['bbox']
            if isinstance(bbox[0], list):  # polygon given as a list of [x, y] points
                bbox_array = np.array(bbox)
                xs = bbox_array[:, 0]
                ys = bbox_array[:, 1]
                center_x = np.mean(xs)
                center_y = np.mean(ys)
                width = np.max(xs) - np.min(xs)
                height = np.max(ys) - np.min(ys)
                print(f" {i+1}. 区域中心({center_x:.0f},{center_y:.0f}) 尺寸({width:.0f}x{height:.0f})")
    if len(dialogues) > limit:
        print(f" ... 还有 {len(dialogues) - limit} 个区域")


def ocr_with_onnxocr_modes(image_path, text_mask_path, output_dir, mode="full"):
    """Run OnnxOCR on an image in one of several modes and save a JSON report.

    Args:
        image_path: Path of the input image.
        text_mask_path: Path of a text mask (may be empty). It is accepted
            for interface compatibility but is not used by this function.
        output_dir: Directory where the JSON report is written; it is
            created if it does not exist.
        mode: OCR mode - "full" (detect + recognise + angle classification),
            "detect" (region detection only) or "fast" (direct detector call).

    Returns:
        A dict with the keys ``json_path``, ``total_count``, ``mode`` and
        ``processing_time`` (OCR seconds as a float) on success, or ``None``
        if OnnxOCR is unavailable or any step raised an exception.
    """
    if not ONNXOCR_AVAILABLE:
        print("[ERROR] OnnxOCR 不可用")
        return None

    # Create the output directory.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] OCR模式: {mode}")
    print(f"[INFO] 输入图片: {image_path}")
    print(f"[INFO] 输出目录: {output_dir}")

    try:
        # Initialise OnnxOCR.
        print("[INFO] 初始化OnnxOCR...")
        start_init = time.time()
        onnxocr_instance = ONNXPaddleOcr(use_angle_cls=True, use_gpu=False)
        # Keep the elapsed duration: the report must contain a duration,
        # not the raw start timestamp.
        init_elapsed = time.time() - start_init
        print(f"[INFO] OnnxOCR 初始化完成 ({init_elapsed:.2f}秒)")

        # Read the image via np.fromfile + cv2.imdecode rather than
        # cv2.imread (presumably so non-ASCII paths work - TODO confirm).
        print(f"[INFO] 读取图片: {image_path}")
        img_array = np.fromfile(str(image_path), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(f"无法读取图片: {image_path}")
        print(f"[INFO] 图片读取成功,尺寸: {img.shape}")

        # Run the OCR operation that matches the requested mode.
        start_ocr = time.time()
        dialogues = _collect_dialogues(onnxocr_instance, img, mode)
        ocr_elapsed = time.time() - start_ocr
        print(f"[INFO] OCR处理完成 ({ocr_elapsed:.2f}秒)")
        print(f"[INFO] 检测到 {len(dialogues)} 个文字区域")

        # Save the result as a JSON file named after the image and mode.
        image_name = Path(image_path).stem
        output_json_path = output_dir / f"{image_name}_dialogues_{mode}.json"
        result_data = {
            "dialogues": dialogues,
            "total_dialogues": len(dialogues),
            "image_path": str(image_path),
            "ocr_engine": "OnnxOCR",
            "ocr_mode": mode,
            "processing_time": {
                "initialization": f"{init_elapsed:.2f}s",
                "ocr_processing": f"{ocr_elapsed:.2f}s",
                "total": f"{time.time()-start_init:.2f}s",
            },
            "performance_info": {
                "detected_regions": len(dialogues),
                "mode_description": _MODE_DESCRIPTIONS.get(mode, "未知模式"),
            },
        }
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)
        print(f"[INFO] 结果已保存到: {output_json_path}")

        # Print a preview of the first few results.
        _print_preview(dialogues, mode)

        print(f"[SUCCESS] OCR识别完成,共处理 {len(dialogues)} 个区域")
        return {
            "json_path": str(output_json_path),
            "total_count": len(dialogues),
            "mode": mode,
            "processing_time": ocr_elapsed,
        }
    except Exception as e:
        # Top-level boundary for the CLI: report the failure with a full
        # traceback and signal it to the caller through a None return.
        print(f"[ERROR] OCR处理失败: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='OnnxOCR多模式文字识别')
    parser.add_argument('image_path', help='输入图片路径')
    parser.add_argument('text_mask_path', nargs='?', default='', help='文字遮罩路径(可选)')
    parser.add_argument('output_dir', help='输出目录')
    parser.add_argument('--mode', choices=['full', 'detect', 'fast'], default='full',
                        help='OCR模式:full(完整OCR), detect(仅检测), fast(快速检测)')
    args = parser.parse_args()

    print("[DEBUG] 开始OCR处理...")
    print(f"[DEBUG] 参数: 图片={args.image_path}, 模式={args.mode}")

    result = ocr_with_onnxocr_modes(args.image_path, args.text_mask_path, args.output_dir, args.mode)
    if result:
        print(f"[SUCCESS] 处理完成: {result}")
    else:
        print("[ERROR] 处理失败")
        sys.exit(1)