yichael
/
AIStoryBoard


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
							import sys
import os
from pathlib import Path
import cv2
import numpy as np
import json
import time
import argparse

# Force stdout/stderr to UTF-8 (presumably so the non-ASCII log messages
# printed by this script are not mangled on consoles with another default
# encoding).
sys.stdout.reconfigure(encoding='utf-8')
sys.stderr.reconfigure(encoding='utf-8')

# Make the locally bundled OnnxOCR checkout importable.
# project_root is three `.parent` hops up from this file, i.e. the directory
# two levels above the one that contains this script; the OnnxOCR sources are
# expected at <project_root>/python/OnnxOCR-main.
project_root = Path(__file__).parent.parent.parent
onnxocr_path = project_root / 'python' / 'OnnxOCR-main'
if onnxocr_path.exists():
    # Prepend so this local copy wins over any other `onnxocr` on sys.path.
    sys.path.insert(0, str(onnxocr_path))
    print(f"[INFO] 使用本地OnnxOCR路径: {onnxocr_path}")
else:
    # Without the local checkout there is nothing to run: abort at import time.
    print(f"[ERROR] 未找到本地OnnxOCR路径: {onnxocr_path}")
    sys.exit(1)

# Import the OCR engine. On ImportError the process exits immediately, so the
# `ONNXOCR_AVAILABLE = False` assignment is never observable by later code;
# in practice the flag is only ever seen as True.
try:
    from onnxocr.onnx_paddleocr import ONNXPaddleOcr
    ONNXOCR_AVAILABLE = True
except ImportError as e:
    print(f"[ERROR] 无法导入OnnxOCR模块: {e}")
    ONNXOCR_AVAILABLE = False
    sys.exit(1)

def _bbox_to_list(bbox):
    """Return *bbox* as plain nested Python lists so ``json.dump`` accepts it."""
    # np.asarray accepts both ndarray boxes and already-listified boxes.
    return np.asarray(bbox).tolist()


def _collect_dialogues(ocr_engine, img, mode):
    """Run *ocr_engine* on *img* in the given mode and return the region dicts.

    Each dict has the keys "bbox", "text", "confidence" and "mode". In the
    detection-only modes "text" is a numbered placeholder and "confidence"
    is fixed at 1.0 because no recognition is performed.

    Raises:
        ValueError: if *mode* is not one of "full", "detect" or "fast".
    """
    if mode == "full":
        # Full pipeline: detection + recognition + angle classification.
        print("[INFO] 执行完整OCR识别（检测+识别+角度分类）...")
        ocr_result = ocr_engine.ocr(img, det=True, rec=True, cls=True)
        if not (ocr_result and ocr_result[0]):
            return []
        return [
            {
                "bbox": _bbox_to_list(bbox),
                "text": text,
                "confidence": float(confidence),
                "mode": "full_ocr",
            }
            for bbox, (text, confidence) in ocr_result[0]
        ]

    if mode == "detect":
        # Detection only: region coordinates, no text recognition.
        print("[INFO] 执行文字区域检测（仅坐标，不识别文字）...")
        detection_result = ocr_engine.ocr(img, det=True, rec=False, cls=False)
        if not (detection_result and detection_result[0]):
            return []
        return [
            {
                "bbox": _bbox_to_list(bbox),
                "text": f"[区域{index}]",  # placeholder text
                "confidence": 1.0,
                "mode": "detection_only",
            }
            for index, bbox in enumerate(detection_result[0], start=1)
        ]

    if mode == "fast":
        # Fast detection: call the engine's detector directly.
        print("[INFO] 执行快速文字检测（直接检测器）...")
        dt_boxes = ocr_engine.text_detector(img)
        if dt_boxes is None or len(dt_boxes) == 0:
            return []
        return [
            {
                "bbox": _bbox_to_list(bbox),
                "text": f"[快速检测{index}]",  # placeholder text
                "confidence": 1.0,
                "mode": "fast_detection",
            }
            for index, bbox in enumerate(dt_boxes, start=1)
        ]

    raise ValueError(f"不支持的模式: {mode}")


def ocr_with_onnxocr_modes(image_path, text_mask_path, output_dir, mode="full"):
    """Run OnnxOCR on one image in the chosen mode and save the result as JSON.

    The result file is written to
    ``<output_dir>/<image stem>_dialogues_<mode>.json``.

    Args:
        image_path: Path of the input image.
        text_mask_path: Path of a text mask (may be empty). Accepted for
            interface compatibility; this function does not read it.
        output_dir: Directory for the JSON output; created if missing.
        mode: OCR mode - "full" (detect + recognize + angle classification),
            "detect" (region detection only) or "fast" (direct detector call).

    Returns:
        A dict with "json_path", "total_count", "mode" and "processing_time"
        (OCR time in seconds, as a float), or ``None`` if OnnxOCR is
        unavailable or any step of the processing raised an exception.
    """
    if not ONNXOCR_AVAILABLE:
        print("[ERROR] OnnxOCR 不可用")
        return None

    # Create the output directory.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] OCR模式: {mode}")
    print(f"[INFO] 输入图片: {image_path}")
    print(f"[INFO] 输出目录: {output_dir}")

    try:
        # Initialise the engine and remember how long that took. The elapsed
        # value (not the raw start timestamp) is what goes into the report.
        print("[INFO] 初始化OnnxOCR...")
        init_started = time.time()
        ocr_engine = ONNXPaddleOcr(use_angle_cls=True, use_gpu=False)
        init_elapsed = time.time() - init_started
        print(f"[INFO] OnnxOCR 初始化完成 ({init_elapsed:.2f}秒)")

        # Read the raw bytes and decode in memory (presumably to cope with
        # paths that cv2.imread cannot open, e.g. non-ASCII paths).
        print(f"[INFO] 读取图片: {image_path}")
        img_array = np.fromfile(str(image_path), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

        if img is None:
            raise ValueError(f"无法读取图片: {image_path}")

        print(f"[INFO] 图片读取成功，尺寸: {img.shape}")

        # Run the OCR step for the requested mode.
        ocr_started = time.time()
        dialogues = _collect_dialogues(ocr_engine, img, mode)
        ocr_elapsed = time.time() - ocr_started
        print(f"[INFO] OCR处理完成 ({ocr_elapsed:.2f}秒)")
        print(f"[INFO] 检测到 {len(dialogues)} 个文字区域")

        # Save the result to a JSON file.
        image_name = Path(image_path).stem
        output_json_path = output_dir / f"{image_name}_dialogues_{mode}.json"

        result_data = {
            "dialogues": dialogues,
            "total_dialogues": len(dialogues),
            "image_path": str(image_path),
            "ocr_engine": "OnnxOCR",
            "ocr_mode": mode,
            "processing_time": {
                "initialization": f"{init_elapsed:.2f}s",
                "ocr_processing": f"{ocr_elapsed:.2f}s",
                "total": f"{time.time()-init_started:.2f}s"
            },
            "performance_info": {
                "detected_regions": len(dialogues),
                "mode_description": {
                    "full": "完整OCR：检测+识别+角度分类",
                    "detect": "仅检测：只检测区域坐标，不识别文字",
                    "fast": "快速检测：直接使用检测器，最快速度"
                }.get(mode, "未知模式")
            }
        }

        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"[INFO] 结果已保存到: {output_json_path}")

        # Print a preview of at most the first five regions.
        print("[INFO] 识别结果预览:")
        for i, d in enumerate(dialogues[:5]):
            if mode == "full":
                print(f"  {i+1}. '{d['text']}' (置信度: {d['confidence']:.3f})")
                continue
            bbox = d['bbox']
            # Only polygon-style boxes (a list of [x, y] points) are previewed.
            if bbox and isinstance(bbox[0], list):
                bbox_array = np.array(bbox)
                center_x = np.mean(bbox_array[:, 0])
                center_y = np.mean(bbox_array[:, 1])
                width = np.max(bbox_array[:, 0]) - np.min(bbox_array[:, 0])
                height = np.max(bbox_array[:, 1]) - np.min(bbox_array[:, 1])
                print(f"  {i+1}. 区域中心({center_x:.0f},{center_y:.0f}) 尺寸({width:.0f}x{height:.0f})")

        if len(dialogues) > 5:
            print(f"  ... 还有 {len(dialogues) - 5} 个区域")

        print(f"[SUCCESS] OCR识别完成，共处理 {len(dialogues)} 个区域")

        return {
            "json_path": str(output_json_path),
            "total_count": len(dialogues),
            "mode": mode,
            "processing_time": ocr_elapsed
        }

    except Exception as e:
        # Top-level boundary for the CLI: report the failure with a traceback
        # and signal it to the caller with None instead of propagating.
        print(f"[ERROR] OCR处理失败: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == '__main__':
    # Command-line entry point: parse the arguments, run the OCR once and
    # turn a failed run into a non-zero exit status.
    cli = argparse.ArgumentParser(description='OnnxOCR多模式文字识别')
    cli.add_argument('image_path', help='输入图片路径')
    # Optional positional placed between the two required ones.
    cli.add_argument('text_mask_path', nargs='?', default='', help='文字遮罩路径（可选）')
    cli.add_argument('output_dir', help='输出目录')
    cli.add_argument(
        '--mode',
        choices=['full', 'detect', 'fast'],
        default='full',
        help='OCR模式：full(完整OCR), detect(仅检测), fast(快速检测)',
    )
    opts = cli.parse_args()

    print("[DEBUG] 开始OCR处理...")
    print(f"[DEBUG] 参数: 图片={opts.image_path}, 模式={opts.mode}")

    outcome = ocr_with_onnxocr_modes(
        opts.image_path,
        opts.text_mask_path,
        opts.output_dir,
        opts.mode,
    )

    # A falsy outcome means the OCR run failed.
    if not outcome:
        print("[ERROR] 处理失败")
        sys.exit(1)

    print(f"[SUCCESS] 处理完成: {outcome}")