# Source repository: yichael/AIStoryBoard
# -*- coding: utf-8 -*-
"""
使用PaddleOCR识别图片中的文字
"""
import sys
import json
import cv2
import numpy as np
import os
from pathlib import Path

# ========== Environment must be configured before Paddle is imported ==========
# Skip the model-source connectivity check to speed up start-up.  This has to
# be set before PaddleOCR is imported.  'True' skips the check; 'False' or an
# unset variable keeps it.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Disable oneDNN / MKL-DNN to avoid a NotImplementedError (noted by the
# original author as a known PaddlePaddle 3.3.0 issue).  Must be set before
# PaddlePaddle is imported.
# The script used to assign FLAGS_onednn and FLAGS_use_mkldnn three times in a
# row ('0', then 'false', then 'OFF'); only the last assignment was ever
# visible to Paddle, so just those effective values are kept here.
# NOTE(review): confirm that Paddle's flag parser treats 'OFF' as false; if it
# does not, '0' is the conventional spelling.
os.environ.update({
    'FLAGS_onednn': 'OFF',
    'FLAGS_use_mkldnn': 'OFF',
    'FLAGS_enable_onednn_layout_fusion': '0',
})

# Reduce log noise: raise the paddlex loggers to WARNING and silence a few
# warnings that do not affect functionality.
import logging
import warnings
for _logger_name in (
    'paddlex',
    'paddlex.inference',
    'paddlex.inference.utils',
    'paddlex.inference.utils.official_models',
):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)
# Suppress the pkg_resources deprecation warnings.
warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*pkg_resources.*')
# Suppress the ccache warning emitted by PaddlePaddle.
warnings.filterwarnings('ignore', message='.*ccache.*')
# ==================================================

# Windows encoding fix: force UTF-8 on stdout/stderr so the Chinese log
# messages and emoji printed by this script can always be encoded;
# characters that still cannot be encoded are replaced instead of raising.
if sys.platform == 'win32':
    for _stream in (sys.stdout, sys.stderr):
        # reconfigure() changes the encoding of the existing text stream in
        # place, so its buffering settings are kept.  Streams that are None
        # or have been replaced by objects without reconfigure() are left
        # untouched instead of crashing on a missing `.buffer` attribute.
        _reconfigure = getattr(_stream, 'reconfigure', None)
        if _reconfigure is not None:
            _reconfigure(encoding='utf-8', errors='replace')

# Prefer a vendored PaddleOCR checkout located next to the project, if any.
project_root = Path(__file__).parent.parent
_vendored_root = project_root / 'PaddleOCR-main'
# Candidate locations, probed in order: the direct checkout, then a nested one.
paddleocr_paths = [_vendored_root, _vendored_root / 'PaddleOCR-main']
# The first candidate that exists and contains a `paddleocr` package wins.
paddleocr_path = next(
    (
        candidate
        for candidate in paddleocr_paths
        if candidate.exists() and (candidate / 'paddleocr').exists()
    ),
    None,
)

if paddleocr_path is None:
    print("[WARN] 未找到本地PaddleOCR，尝试使用pip安装的版本")
else:
    # Put the vendored checkout ahead of any pip-installed copy.
    sys.path.insert(0, str(paddleocr_path))
    print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")

# PaddleOCR is mandatory: report the import failure and stop the process.
try:
    from paddleocr import PaddleOCR
except ImportError as e:
    print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
    print("[ERROR] PaddleOCR 是必需的，请确保已正确安装")
    PADDLEOCR_AVAILABLE = False
    sys.exit(1)
else:
    PADDLEOCR_AVAILABLE = True

# Container types accepted for coordinate data.  The parsed OCR result
# presumably holds plain lists, but ndarrays are accepted as well so raw
# arrays are not rejected by the type checks below.
_COORD_SEQUENCE_TYPES = (list, tuple, np.ndarray)


def _has_items(value):
    """Return True when *value* holds at least one element.

    ``len()`` is used instead of truthiness because the truth value of a
    multi-element NumPy array is ambiguous and raises ``ValueError``.
    """
    try:
        return len(value) > 0
    except TypeError:
        # None and scalars have no length; fall back to plain truthiness.
        return bool(value)


def _read_image_rgb(image_path):
    """Decode the file at *image_path* into a 3-channel RGB array.

    Raises:
        FileNotFoundError: if the file does not exist (from ``np.fromfile``).
        ValueError: if the file is empty or cannot be decoded as an image.
    """
    # np.fromfile + cv2.imdecode is used instead of cv2.imread so that
    # non-ASCII (e.g. Chinese) paths can be read.
    raw = np.fromfile(str(image_path), dtype=np.uint8)
    # An empty buffer would make cv2.imdecode raise its own error; report it
    # as an unreadable image instead.
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if img is None:
        raise ValueError(f"无法读取图片: {image_path}")
    # IMREAD_COLOR always produces a 3-channel BGR image, so a single
    # BGR -> RGB conversion covers every input.
    # NOTE(review): PaddleOCR may expect BGR for ndarray input; confirm that
    # passing RGB here is intended.
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def _box_to_polygon(box):
    """Expand an ``[x1, y1, x2, y2]`` box into its four corner points."""
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    return [
        [left, top],      # top-left
        [right, top],     # top-right
        [right, bottom],  # bottom-right
        [left, bottom],   # bottom-left
    ]


def _bbox_from_polygon(points):
    """Return the axis-aligned bbox dict for *points*, or None.

    Points that are not sequences of at least two values are ignored; None is
    returned when fewer than four usable points remain.
    """
    xs, ys = [], []
    for point in points:
        if isinstance(point, _COORD_SEQUENCE_TYPES) and len(point) >= 2:
            xs.append(point[0])
            ys.append(point[1])
    if len(xs) < 4:
        return None

    x1, y1 = int(min(xs)), int(min(ys))
    x2, y2 = int(max(xs)), int(max(ys))
    return {
        'x1': x1,
        'y1': y1,
        'x2': x2,
        'y2': y2,
        'width': x2 - x1,
        'height': y2 - y1,
        'center_x': float((x1 + x2) / 2),
        'center_y': float((y1 + y2) / 2),
    }


def _parse_dialogues(res_data):
    """Convert the ``res`` mapping of an OCR result into dialogue dicts.

    Reads ``rec_texts``, ``rec_scores``, ``rec_polys`` and ``rec_boxes``.
    Blank texts and entries without usable coordinates are skipped (with a
    warning); a malformed entry never discards the other entries.
    """
    rec_texts = res_data.get('rec_texts', [])
    rec_scores = res_data.get('rec_scores', [])
    rec_polys = res_data.get('rec_polys', [])  # [[[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ...]
    rec_boxes = res_data.get('rec_boxes', [])  # [[x1,y1,x2,y2], ...]

    print(f"[OK] 识别到 {len(rec_texts)} 个文本区域")

    dialogues = []
    for idx, text in enumerate(rec_texts):
        if not text or not text.strip():
            continue

        # Fall back to 0.9 when no score is reported for this text.
        confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9

        try:
            # Prefer the polygon; fall back to the rectangular box.
            if idx < len(rec_polys) and _has_items(rec_polys[idx]):
                polygon = rec_polys[idx]
            elif idx < len(rec_boxes) and _has_items(rec_boxes[idx]):
                polygon = _box_to_polygon(rec_boxes[idx])
            else:
                print(f"  [WARN] 第 {idx} 个文本没有坐标信息，跳过")
                continue

            if not isinstance(polygon, _COORD_SEQUENCE_TYPES) or len(polygon) < 4:
                print(f"  [WARN] 第 {idx} 个文本坐标格式不正确，跳过")
                continue

            bbox = _bbox_from_polygon(polygon)
        except (TypeError, IndexError, ValueError) as e:
            print(f"  [WARN] 第 {idx} 个文本解析坐标失败: {e}，跳过")
            continue

        if bbox is None:
            print(f"  [WARN] 第 {idx} 个文本无法提取足够的坐标点，跳过")
            continue

        dialogues.append({
            'order': len(dialogues) + 1,
            'text': text.strip(),
            'bbox': bbox,
            'confidence': confidence,
        })
        print(f"  [{len(dialogues)}/{len(rec_texts)}] {text[:50]}...")

    return dialogues


def ocr_with_paddleocr(image_path, text_mask_path, output_dir):
    """Recognize the text in an image with PaddleOCR and save it as JSON.

    Args:
        image_path: Path of the original image (str or Path).
        text_mask_path: Path of the text-mask image.  Kept for interface
            compatibility; it is currently not used and may be empty or None.
        output_dir: Directory for the result; created if it does not exist.

    Returns:
        Path of the written ``<image stem>_dialogues.json`` file.

    Raises:
        FileNotFoundError: if the image file does not exist.
        ValueError: if the image is empty or cannot be decoded.
        RuntimeError: if PaddleOCR cannot be initialized.
        Exception: whatever ``predict`` raises is logged and re-raised.
    """
    import traceback

    image_path = Path(image_path)
    output_dir = Path(output_dir)

    # Path.mkdir copes with non-ASCII directory names.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 读取原始图片: {image_path.name}")
    img = _read_image_rgb(image_path)

    img_height, img_width = img.shape[:2]
    print(f"[INFO] 图片尺寸: {img_width}x{img_height}")

    # Initialize PaddleOCR with the plain constructor (no explicit model
    # paths) and with MKL-DNN/oneDNN explicitly disabled.
    print("[INFO] 初始化PaddleOCR...")
    try:
        paddleocr_instance = PaddleOCR(
            lang='ch',  # Chinese
            enable_mkldnn=False,
        )
        print("[INFO] PaddleOCR 初始化成功")
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        traceback.print_exc()
        raise RuntimeError(f"PaddleOCR 初始化失败: {e}") from e

    # Run recognition on the in-memory array, so the image path (possibly
    # non-ASCII) is never handed to PaddleOCR.
    print("[INFO] 正在识别文字...")
    print(f"[DEBUG] 图片数组信息: shape={img.shape}, dtype={img.dtype}, min={img.min()}, max={img.max()}")
    try:
        print("[DEBUG] 准备调用 paddleocr_instance.predict...")
        sys.stdout.flush()  # make sure the log is visible before the long call
        ocr_result = paddleocr_instance.predict(img)
        print(f"[DEBUG] OCR结果类型: {type(ocr_result)}, 长度: {len(ocr_result) if ocr_result else 0}")
        sys.stdout.flush()
    except Exception as e:
        print(f"[ERROR] OCR识别失败: {e}")
        traceback.print_exc()
        sys.stdout.flush()
        raise

    if not ocr_result:
        print("[WARN] 未识别到任何文字")
        dialogues = []
    else:
        try:
            # The first result item exposes its data through `.json`; the
            # recognition fields live under the 'res' key.
            result_json = ocr_result[0].json
            res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {}
            dialogues = _parse_dialogues(res_data)
        except Exception as e:
            # Best effort: an unparsable result still produces an (empty)
            # output file instead of aborting.
            print(f"[ERROR] 解析PaddleOCR结果失败: {e}")
            traceback.print_exc()
            dialogues = []

    # NOTE(review): 'reading_order' is a fixed label; the dialogues are
    # written in the order PaddleOCR returned them and are not re-sorted here.
    output_json = {
        'image_file': image_path.name,
        'reading_order': '从右到左、从上到下（日式漫画阅读顺序）',
        'dialogues': dialogues,
        'total_count': len(dialogues),
    }

    output_file = output_dir / f"{image_path.stem}_dialogues.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_json, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 结果已保存到: {output_file}")
    return output_file

def main(argv=None):
    """Command-line entry point.

    Usage: ``python ocr_with_paddleocr.py <image> [text_mask] <output_dir>``

    With two arguments they are the image path and the output directory.
    With three or more, the second is the text-mask path and the third the
    output directory (the mask may be omitted because an empty-string
    argument can be dropped by the calling shell).

    Args:
        argv: Full argument vector including the script name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 on a usage error, user interrupt or any failure.
    """
    import traceback

    argv = sys.argv if argv is None else argv
    try:
        print(f"[DEBUG] sys.argv: {argv}")
        print(f"[DEBUG] sys.argv长度: {len(argv)}")
        # At least: script name, image path, output directory.
        if len(argv) < 3:
            print("用法: python ocr_with_paddleocr.py <原始图片路径> [文字遮罩图路径] <输出目录>")
            return 1

        image_path = argv[1]
        if len(argv) == 3:
            # No mask argument was passed: the second argument is the output directory.
            text_mask_path, output_dir = "", argv[2]
        else:
            text_mask_path, output_dir = argv[2], argv[3]

        print(f"[DEBUG] 参数: image_path={image_path}, text_mask_path={text_mask_path}, output_dir={output_dir}")

        # Fail early, before any OCR work, when the image is missing.
        if not Path(image_path).exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        ocr_with_paddleocr(image_path, text_mask_path, output_dir)
    except KeyboardInterrupt:
        print("[INFO] 用户中断")
        return 1
    except Exception as e:
        # Top-level boundary: log the failure and turn it into an exit code.
        print(f"[ERROR] OCR识别失败: {e}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())