# -*- coding: utf-8 -*-
# Source repository: yichael/AIStoryBoard
"""
Precise text-region detection using PaddleOCR's text detection module.

Produces exact polygon coordinates and bounding-box information.
"""

import sys
import json
import cv2
import numpy as np
import os
from pathlib import Path

# Windows encoding fix: re-wrap stdout/stderr as UTF-8 so the non-ASCII log
# messages below can be printed; unencodable characters are replaced.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# Disable oneDNN to avoid a NotImplementedError
os.environ['FLAGS_onednn'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0'
os.environ['FLAGS_enable_onednn_layout_fusion'] = '0'
# Skip the model-source check to speed up start-up.
# NOTE: this must be set before PaddleOCR is imported.
# The correct environment variable name is PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Raise log levels to cut down on unnecessary log output
import logging
import warnings
logging.getLogger('paddlex').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference.utils').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference.utils.official_models').setLevel(logging.WARNING)
# Suppress the pkg_resources deprecation warnings
warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*pkg_resources.*')
# Suppress the ccache warning (a PaddlePaddle warning that does not affect functionality)
warnings.filterwarnings('ignore', message='.*ccache.*')

# Add a local PaddleOCR checkout to the import path.
# The project root is taken to be the parent of this file's directory.
project_root = Path(__file__).parent.parent
# Try several candidate locations
paddleocr_paths = [
    project_root / 'PaddleOCR-main',  # direct path
    project_root / 'PaddleOCR-main' / 'PaddleOCR-main',  # nested path
]
paddleocr_path = None
for path in paddleocr_paths:
    # A candidate only counts if it actually contains a 'paddleocr' entry.
    if path.exists() and (path / 'paddleocr').exists():
        paddleocr_path = path
        break

if paddleocr_path:
    # Inserted at position 0 so the local checkout takes precedence on import.
    sys.path.insert(0, str(paddleocr_path))
    print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")
else:
    print(f"[WARN] 未找到本地PaddleOCR，尝试使用pip安装的版本")

try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except ImportError as e:
    # PaddleOCR is mandatory: report the failure and terminate the process.
    print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
    print("[ERROR] PaddleOCR 是必需的，请确保已正确安装")
    PADDLEOCR_AVAILABLE = False
    sys.exit(1)


# Confidence assumed for detections that carry no score of their own.
_DEFAULT_CONFIDENCE = 0.9
# Regions narrower or shorter than this many pixels are treated as noise.
_MIN_REGION_SIZE = 10


def _to_plain(value):
    """Return ``value.tolist()`` for numpy-like objects, else ``value`` unchanged."""
    return value.tolist() if hasattr(value, 'tolist') else value


def _read_image_size(image_path):
    """Decode the image at ``image_path`` and return ``(width, height)`` in pixels.

    Raises:
        ValueError: if the file is empty or cannot be decoded as an image.
    """
    # Read the bytes with numpy and decode in memory; the original code notes
    # this is done to cope with paths containing Chinese characters.
    raw = np.fromfile(str(image_path), dtype=np.uint8)
    # An empty buffer would make cv2.imdecode fail; report it as unreadable.
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if img is None:
        raise ValueError(f"无法读取图片: {image_path}")
    height, width = img.shape[:2]
    return width, height


def _create_paddleocr():
    """Build a PaddleOCR instance that uses the models under ~/.paddlex/official_models.

    Raises:
        FileNotFoundError: if the detection or recognition model directory is missing.
    """
    # Point directly at the local model directories (PP-OCRv5 server models)
    # so that models are not checked/downloaded on every run.
    model_base_dir = os.path.join(os.path.expanduser('~'), '.paddlex', 'official_models')

    text_detection_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_det')
    text_recognition_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_rec')
    textline_orientation_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_textline_ori')
    doc_orientation_classify_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_doc_ori')
    doc_unwarping_model_dir = os.path.join(model_base_dir, 'UVDoc')

    if not os.path.exists(text_detection_model_dir):
        raise FileNotFoundError(f"检测模型目录不存在: {text_detection_model_dir}")
    if not os.path.exists(text_recognition_model_dir):
        raise FileNotFoundError(f"识别模型目录不存在: {text_recognition_model_dir}")

    return PaddleOCR(
        text_detection_model_dir=text_detection_model_dir,
        text_recognition_model_dir=text_recognition_model_dir,
        textline_orientation_model_dir=textline_orientation_model_dir,
        doc_orientation_classify_model_dir=doc_orientation_classify_model_dir,
        doc_unwarping_model_dir=doc_unwarping_model_dir,
        use_textline_orientation=False,  # text-line orientation detection is not used
        enable_mkldnn=False  # explicitly disable MKL-DNN/oneDNN
    )


def _expand_page_result(page):
    """Yield ``(polygon, text, confidence)`` from one page-level result mapping.

    Handles a dict-like result holding parallel lists: ``rec_polys`` /
    ``rec_texts`` / ``rec_scores``, or ``dt_polys`` / ``dt_scores`` when only
    detection output is present.
    NOTE(review): these key names follow the PaddleOCR 3.x result format as
    documented upstream — confirm against the installed version.
    """
    polys = page.get('rec_polys')
    if polys is not None:
        texts, scores = page.get('rec_texts'), page.get('rec_scores')
    else:
        polys, texts, scores = page.get('dt_polys'), None, page.get('dt_scores')
    if polys is None:
        return
    for i, poly in enumerate(polys):
        text = texts[i] if texts is not None and i < len(texts) else ''
        score = scores[i] if scores is not None and i < len(scores) else _DEFAULT_CONFIDENCE
        yield poly, text, score


def _iter_detections(result):
    """Yield ``(polygon, text, confidence)`` for every detection in ``result``.

    Accepted item shapes:
      * ``[polygon, (text, score)]`` or ``[polygon, [text, score]]``
      * ``[polygon, score]`` and ``[polygon]``
      * a dict with ``dt_poly``/``polygon``, ``dt_score``/``confidence``, ``text``
      * a page-level dict with ``rec_polys``/``dt_polys`` lists
    ``None`` items and unrecognised shapes are skipped. Confidence is yielded
    unconverted so that a bad value only invalidates its own detection.
    """
    if not result:
        return
    # Per-image nesting: [[item, item, ...]] -> take the first image's items.
    nested = isinstance(result, list) and isinstance(result[0], list)
    items = result[0] if nested else result

    for item in items:
        if item is None:
            continue
        if isinstance(item, dict):
            if 'rec_polys' in item or 'dt_polys' in item:
                yield from _expand_page_result(item)
                continue
            coords = item.get('dt_poly')
            # len() instead of truthiness: a numpy polygon has no boolean value.
            if coords is None or len(coords) == 0:
                coords = item.get('polygon')
            score = item.get('dt_score', item.get('confidence', _DEFAULT_CONFIDENCE))
            yield coords, item.get('text', ''), score
        elif isinstance(item, (list, tuple)) and item:
            text, score = '', _DEFAULT_CONFIDENCE
            if len(item) >= 2:
                payload = item[1]
                if isinstance(payload, (list, tuple)) and len(payload) >= 2:
                    text, score = payload[0], payload[1]
                elif isinstance(payload, (int, float)):
                    score = payload
            yield item[0], text, score


def _build_text_block(coords, text, confidence, min_confidence, block_number):
    """Turn one detection into a text-block dict, or ``None`` if it is filtered out.

    A detection is dropped when its confidence is below ``min_confidence``,
    when it has fewer than four points, or when its bounding box is smaller
    than ``_MIN_REGION_SIZE`` pixels in either dimension.

    Raises:
        TypeError, ValueError, IndexError: if the detection is malformed.
    """
    confidence = float(confidence)
    if confidence < min_confidence:
        return None

    # Convert numpy polygons to plain lists so they can be tested and
    # serialised to JSON.
    coords = _to_plain(coords)
    if not coords or len(coords) < 4:
        return None
    polygon = [_to_plain(point) for point in coords]

    x_coords = []
    y_coords = []
    for point in polygon:
        if isinstance(point, (list, tuple)) and len(point) >= 2:
            x_coords.append(float(point[0]))
            y_coords.append(float(point[1]))
    if not x_coords:
        return None

    # Axis-aligned bounding box enclosing the polygon.
    x1, y1 = int(min(x_coords)), int(min(y_coords))
    x2, y2 = int(max(x_coords)), int(max(y_coords))
    width = x2 - x1
    height = y2 - y1
    if width < _MIN_REGION_SIZE or height < _MIN_REGION_SIZE:
        return None

    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    return {
        'block_index': block_number,
        'order': block_number,
        'text': text,  # recognised text, when the OCR result provides it
        'bbox': {
            'x1': x1,
            'y1': y1,
            'x2': x2,
            'y2': y2,
            'width': width,
            'height': height,
            'center_x': center_x,
            'center_y': center_y
        },
        'polygon': polygon,  # exact polygon coordinates
        'confidence': round(confidence, 4),
        'center_x': round(center_x, 2),
        'center_y': round(center_y, 2),
        'width': width,
        'height': height,
        'area': width * height
    }


def detect_text_regions(image_path, output_dir, min_confidence=0.5):
    """
    Detect text regions in an image with PaddleOCR and save them as JSON.

    Args:
        image_path: path of the image to analyse.
        output_dir: directory for the result file; created if missing.
        min_confidence: detections scoring below this value are discarded.

    Returns:
        Path (str) of the written ``<image stem>_text_blocks.json`` file.

    Raises:
        ValueError: if the image cannot be decoded.
        FileNotFoundError: if the image or a required model directory is missing.
        Exception: any error raised by PaddleOCR is logged and re-raised.
    """
    image_path = Path(image_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 读取图片: {image_path.name}")
    print(f"[INFO] 图片完整路径: {image_path}")

    # The decoded image is only needed for its dimensions; OCR itself is run
    # on the file path below.
    img_width, img_height = _read_image_size(image_path)
    print(f"[INFO] 图片尺寸: {img_width}x{img_height}")

    print("[INFO] 初始化PaddleOCR文本检测模块...")
    try:
        paddleocr_instance = _create_paddleocr()
        print("[INFO] PaddleOCR 初始化成功（使用本地模型）")
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        raise

    print("[INFO] 正在检测文字区域...")
    try:
        # ocr() performs detection and recognition; the coordinates are what
        # matters here, the recognised text is stored only as extra information.
        result = paddleocr_instance.ocr(str(image_path))

        text_blocks = []
        for idx, (coords, text_content, confidence) in enumerate(_iter_detections(result)):
            try:
                block = _build_text_block(
                    coords, text_content, confidence, min_confidence, len(text_blocks) + 1
                )
            except (TypeError, ValueError, IndexError) as e:
                # One malformed detection must not abort the whole run.
                print(f"  [WARN] 第 {idx} 个检测结果解析失败: {e}，跳过")
                continue
            if block is not None:
                text_blocks.append(block)

        print(f"[OK] 检测到 {len(text_blocks)} 个文字区域")

        # Sum of bounding-box areas (overlapping boxes are counted twice).
        total_text_area = sum(block['area'] for block in text_blocks)
        total_image_area = img_width * img_height
        text_area_ratio = (total_text_area / total_image_area * 100) if total_image_area > 0 else 0

        image_name = image_path.stem
        result_data = {
            'image_file': image_name + image_path.suffix,
            'image_size': {
                'width': img_width,
                'height': img_height,
                'total_area': total_image_area
            },
            'text_blocks': text_blocks,
            'total_count': len(text_blocks),
            'total_text_area': total_text_area,
            'text_area_ratio': round(text_area_ratio, 2)
        }

        output_json = output_dir / f"{image_name}_text_blocks.json"
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"[OK] 已保存检测结果: {output_json.name}")
        print(f"[INFO] 文字区域总面积: {total_text_area:.0f} 像素²")
        print(f"[INFO] 文字区域占比: {text_area_ratio:.2f}%")

        # No dialogues.json is produced: per the original note, later stages
        # of the pipeline only consume text_blocks.json.
        return str(output_json)

    except Exception as e:
        print(f"[ERROR] 文本检测失败: {e}")
        import traceback
        traceback.print_exc()
        raise


if __name__ == '__main__':
    # Command-line entry point: <image path> <output dir> [min confidence]
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("用法: python paddleocr_text_detection.py <图片路径> <输出目录> [最小置信度]")
        sys.exit(1)

    # Work with absolute paths to avoid encoding problems
    image_path = str(Path(cli_args[0]).resolve())
    output_dir = str(Path(cli_args[1]).resolve())
    if len(cli_args) > 2:
        min_confidence = float(cli_args[2])
    else:
        min_confidence = 0.5

    try:
        detect_text_regions(image_path, output_dir, min_confidence)
    except Exception as e:
        # Report the failure and signal it to the caller via the exit status.
        print(f"[ERROR] 处理失败: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)