# -*- coding: utf-8 -*-
"""
Detect text regions precisely with PaddleOCR.

Runs PaddleOCR on a single image and writes the polygon coordinates and
bounding boxes of every detected text region to a JSON file.
"""
import json
import logging
import os
import sys
import traceback
import warnings
from collections.abc import Mapping
from pathlib import Path

import cv2
import numpy as np

# Windows encoding fix: force UTF-8 on stdout/stderr so the non-ASCII log
# messages below do not raise encoding errors on legacy code pages.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# Disable oneDNN to avoid NotImplementedError.
os.environ['FLAGS_onednn'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0'
os.environ['FLAGS_enable_onednn_layout_fusion'] = '0'

# Skip the model source check to speed up start-up.
# NOTE: this must be set before PaddleOCR is imported.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Raise the log level of the paddlex loggers to cut down on noise.
logging.getLogger('paddlex').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference.utils').setLevel(logging.WARNING)
logging.getLogger('paddlex.inference.utils.official_models').setLevel(logging.WARNING)

# Suppress the pkg_resources deprecation warnings.
warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*pkg_resources.*')
# Suppress the ccache warning (emitted by PaddlePaddle; does not affect functionality).
warnings.filterwarnings('ignore', message='.*ccache.*')

# Prefer a local PaddleOCR checkout next to the project if one exists.
project_root = Path(__file__).parent.parent
# Candidate locations, tried in order.
paddleocr_paths = [
    project_root / 'PaddleOCR-main',                     # direct path
    project_root / 'PaddleOCR-main' / 'PaddleOCR-main',  # nested path
]

paddleocr_path = None
for path in paddleocr_paths:
    if path.exists() and (path / 'paddleocr').exists():
        paddleocr_path = path
        break

if paddleocr_path:
    sys.path.insert(0, str(paddleocr_path))
    print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")
else:
    print("[WARN] 未找到本地PaddleOCR,尝试使用pip安装的版本")

try:
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
except ImportError as e:
    print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
    print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
    PADDLEOCR_AVAILABLE = False
    sys.exit(1)

# Confidence assigned to a region when the OCR result carries no score.
DEFAULT_CONFIDENCE = 0.9
# Regions narrower or shorter than this many pixels are treated as noise.
MIN_REGION_SIZE = 10


def _to_plain(value):
    """Convert numpy arrays/scalars to plain Python objects; pass others through."""
    return value.tolist() if hasattr(value, 'tolist') else value


def _normalize_polygon(coords):
    """Return *coords* as a list of plain-Python points, dropping malformed ones.

    Accepts nested lists/tuples as well as numpy arrays, so the result is
    always safe to test for length and to serialize as JSON.
    """
    coords = _to_plain(coords)
    if not isinstance(coords, (list, tuple)):
        return []
    polygon = []
    for point in coords:
        point = _to_plain(point)
        if isinstance(point, (list, tuple)) and len(point) >= 2:
            polygon.append([_to_plain(v) for v in point])
    return polygon


def _iter_page_regions(page):
    """Yield (coords, confidence, text) from a per-image result mapping.

    NOTE(review): the plural keys ('rec_polys', 'rec_scores', 'rec_texts',
    'dt_polys') follow the PaddleOCR 3.x pipeline result documentation;
    'dt_scores' is read opportunistically. Confirm against the installed
    PaddleOCR version.
    """
    polys = page.get('rec_polys')
    scores = page.get('rec_scores')
    texts = page.get('rec_texts')
    if polys is None:
        # No recognition output: fall back to the raw detection polygons.
        polys = page.get('dt_polys')
        scores = page.get('dt_scores')
        texts = None
    if polys is None:
        return
    scores = [] if scores is None else list(scores)
    texts = [] if texts is None else list(texts)
    for i, poly in enumerate(polys):
        confidence = float(scores[i]) if i < len(scores) else DEFAULT_CONFIDENCE
        text = texts[i] if i < len(texts) else ''
        yield poly, confidence, text


def _parse_region_item(item):
    """Extract (coords, confidence, text) from one per-region result item.

    Handles the legacy ``[polygon, (text, score)]`` shape, a bare
    ``[polygon]``, and a per-region mapping with 'dt_poly'/'polygon' keys.
    Unrecognized shapes yield ``coords=None``.
    """
    coords = None
    confidence = DEFAULT_CONFIDENCE
    text = ''
    if isinstance(item, (list, tuple)) and len(item) >= 2:
        coords = item[0]
        info = item[1]
        if isinstance(info, (list, tuple)) and len(info) >= 2:
            text = info[0]
            confidence = float(info[1])
        elif isinstance(info, (int, float)):
            confidence = float(info)
    elif isinstance(item, (list, tuple)) and len(item) == 1:
        # Coordinates only, no recognition result.
        coords = item[0]
    elif isinstance(item, Mapping):
        coords = item.get('dt_poly')
        if coords is None or len(coords) == 0:
            coords = item.get('polygon')
        confidence = float(item.get('dt_score', item.get('confidence', DEFAULT_CONFIDENCE)))
        text = item.get('text', '')
    return coords, confidence, text


def _iter_detections(result):
    """Yield (coords, confidence, text) for every region in an OCR result.

    Supports both the legacy nested-list result (``[[item, item, ...]]``)
    and a list of per-image mappings carrying plural polygon keys.
    """
    if result is None or len(result) == 0:
        return
    if isinstance(result, Mapping):
        result = [result]
    # Legacy results wrap the regions of the (single) image in an outer list.
    items = result[0] if isinstance(result, list) and isinstance(result[0], list) else result
    for item in items:
        if item is None:
            continue
        if isinstance(item, Mapping) and ('rec_polys' in item or 'dt_polys' in item):
            yield from _iter_page_regions(item)
        else:
            yield _parse_region_item(item)


def _build_text_block(coords, confidence, text, order):
    """Build the JSON record for one region, or return None if it is unusable.

    A region is unusable when it has fewer than four valid points or when its
    bounding box is smaller than MIN_REGION_SIZE in either dimension.
    Raises TypeError/ValueError if a coordinate is not numeric.
    """
    polygon = _normalize_polygon(coords)
    if len(polygon) < 4:
        return None

    xs = [float(point[0]) for point in polygon]
    ys = [float(point[1]) for point in polygon]
    x1, y1 = int(min(xs)), int(min(ys))
    x2, y2 = int(max(xs)), int(max(ys))
    width = x2 - x1
    height = y2 - y1

    # Drop regions that are too small (probably noise).
    if width < MIN_REGION_SIZE or height < MIN_REGION_SIZE:
        return None

    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    return {
        'block_index': order,
        'order': order,
        'text': text,  # recognized text, when the result provides it
        'bbox': {
            'x1': x1,
            'y1': y1,
            'x2': x2,
            'y2': y2,
            'width': width,
            'height': height,
            'center_x': center_x,
            'center_y': center_y
        },
        'polygon': polygon,  # precise polygon coordinates
        'confidence': round(confidence, 4),
        'center_x': round(center_x, 2),
        'center_y': round(center_y, 2),
        'width': width,
        'height': height,
        'area': width * height
    }


def detect_text_regions(image_path, output_dir, min_confidence=0.5):
    """
    Detect text regions in an image with PaddleOCR and save them as JSON.

    Args:
        image_path: Path of the image to analyse.
        output_dir: Directory the result JSON is written to (created if missing).
        min_confidence: Regions with a confidence below this value are dropped.

    Returns:
        Path (as ``str``) of the written ``<image stem>_text_blocks.json`` file.

    Raises:
        ValueError: If the image cannot be read or decoded.
        FileNotFoundError: If the detection or recognition model directory
            is missing.
        Exception: Any error raised by PaddleOCR is logged and re-raised.
    """
    image_path = Path(image_path)
    output_dir = Path(output_dir)
    # Path.mkdir copes with non-ASCII paths.
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 读取图片: {image_path.name}")
    print(f"[INFO] 图片完整路径: {image_path}")

    # Read via np.fromfile + imdecode so that non-ASCII paths work.
    # The decoded image is only needed for its dimensions.
    img_array = np.fromfile(str(image_path), dtype=np.uint8)
    # An empty buffer would make imdecode fail with an opaque OpenCV error.
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) if img_array.size else None
    if img is None:
        raise ValueError(f"无法读取图片: {image_path}")

    img_height, img_width = img.shape[:2]
    print(f"[INFO] 图片尺寸: {img_width}x{img_height}")

    # Initialize PaddleOCR.
    print("[INFO] 初始化PaddleOCR文本检测模块...")
    try:
        # Point directly at the local model directories so PaddleOCR does not
        # check for / download models on every run (PP-OCRv5 server models).
        user_home = os.path.expanduser('~')
        model_base_dir = os.path.join(user_home, '.paddlex', 'official_models')

        text_detection_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_det')
        text_recognition_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_rec')
        textline_orientation_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_textline_ori')
        doc_orientation_classify_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_doc_ori')
        doc_unwarping_model_dir = os.path.join(model_base_dir, 'UVDoc')

        # Fail early with a clear message if the required models are missing.
        if not os.path.exists(text_detection_model_dir):
            raise FileNotFoundError(f"检测模型目录不存在: {text_detection_model_dir}")
        if not os.path.exists(text_recognition_model_dir):
            raise FileNotFoundError(f"识别模型目录不存在: {text_recognition_model_dir}")

        # NOTE(review): document orientation classification and unwarping are
        # not explicitly disabled here. If the pipeline enables them by
        # default, the returned coordinates may refer to the preprocessed
        # image rather than the original — confirm for the installed version.
        paddleocr_instance = PaddleOCR(
            text_detection_model_dir=text_detection_model_dir,
            text_recognition_model_dir=text_recognition_model_dir,
            textline_orientation_model_dir=textline_orientation_model_dir,
            doc_orientation_classify_model_dir=doc_orientation_classify_model_dir,
            doc_unwarping_model_dir=doc_unwarping_model_dir,
            use_textline_orientation=False,  # no text-line orientation detection
            enable_mkldnn=False  # explicitly disable MKL-DNN/oneDNN
        )
        print("[INFO] PaddleOCR 初始化成功(使用本地模型)")
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        raise

    # Run the detection.
    print("[INFO] 正在检测文字区域...")
    try:
        # ocr() runs detection and recognition together; the coordinates are
        # what matters here, the recognized text is stored only as a bonus.
        result = paddleocr_instance.ocr(str(image_path))

        text_blocks = []
        for idx, (coords, confidence, text_content) in enumerate(_iter_detections(result)):
            # Drop low-confidence results.
            if confidence < min_confidence:
                continue
            try:
                block = _build_text_block(
                    coords, confidence, text_content, len(text_blocks) + 1
                )
            except (TypeError, ValueError, IndexError) as e:
                print(f"  [WARN] 第 {idx} 个检测结果解析失败: {e},跳过")
                continue
            if block is not None:
                text_blocks.append(block)

        print(f"[OK] 检测到 {len(text_blocks)} 个文字区域")

        # Aggregate area statistics (bounding boxes may overlap).
        total_text_area = sum(block['area'] for block in text_blocks)
        total_image_area = img_width * img_height
        text_area_ratio = (total_text_area / total_image_area * 100) if total_image_area > 0 else 0

        # Assemble the result document.
        image_name = image_path.stem
        result_data = {
            'image_file': image_name + image_path.suffix,
            'image_size': {
                'width': img_width,
                'height': img_height,
                'total_area': total_image_area
            },
            'text_blocks': text_blocks,
            'total_count': len(text_blocks),
            'total_text_area': total_text_area,
            'text_area_ratio': round(text_area_ratio, 2)
        }

        # Write the JSON file.
        output_json = output_dir / f"{image_name}_text_blocks.json"
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"[OK] 已保存检测结果: {output_json.name}")
        print(f"[INFO] 文字区域总面积: {total_text_area:.0f} 像素²")
        print(f"[INFO] 文字区域占比: {text_area_ratio:.2f}%")

        # No dialogues.json is produced; only text_blocks.json is written.
        return str(output_json)

    except Exception as e:
        print(f"[ERROR] 文本检测失败: {e}")
        traceback.print_exc()
        raise


def main(argv=None):
    """Command-line entry point; returns the process exit code.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    usage = "用法: python paddleocr_text_detection.py <图片路径> <输出目录> [最小置信度]"
    if len(args) < 2:
        print(usage)
        return 1

    # Use absolute paths to avoid encoding problems.
    image_path = str(Path(args[0]).resolve())
    output_dir = str(Path(args[1]).resolve())
    try:
        min_confidence = float(args[2]) if len(args) > 2 else 0.5
    except ValueError:
        # A non-numeric threshold is a usage error, not a crash.
        print(f"[ERROR] 最小置信度必须是数字: {args[2]}")
        print(usage)
        return 1

    try:
        detect_text_regions(image_path, output_dir, min_confidence)
    except Exception as e:
        print(f"[ERROR] 处理失败: {e}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())