| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314 |
- # -*- coding: utf-8 -*-
- """
- 使用PaddleOCR的文本检测模块进行精确的文字区域检测
- 返回精确的多边形坐标和边界框信息
- """
- import sys
- import json
- import cv2
- import numpy as np
- import os
- from pathlib import Path
# Windows console fix: re-wrap stdout/stderr so that output is always encoded
# as UTF-8 and unencodable characters are replaced instead of raising.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# Disable oneDNN to avoid NotImplementedError.
os.environ.update({
    'FLAGS_onednn': '0',
    'FLAGS_use_mkldnn': '0',
    'FLAGS_enable_onednn_layout_fusion': '0',
})

# Skip the model-source check to speed up start-up.
# NOTE: this has to be set before PaddleOCR is imported, and the variable name
# must be exactly PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK.
os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'

# Raise the log level of the paddlex loggers to cut down on noisy output.
import logging
import warnings

for _logger_name in (
    'paddlex',
    'paddlex.inference',
    'paddlex.inference.utils',
    'paddlex.inference.utils.official_models',
):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

# Silence the pkg_resources deprecation warnings.
for _warning_category in (UserWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_warning_category, message='.*pkg_resources.*')
# Silence the ccache warning (emitted by PaddlePaddle; it does not affect functionality).
warnings.filterwarnings('ignore', message='.*ccache.*')

# Locate a local PaddleOCR checkout next to the project and put it on sys.path.
project_root = Path(__file__).parent.parent
# Candidate locations, tried in order.
paddleocr_paths = [
    project_root / 'PaddleOCR-main',  # direct checkout
    project_root / 'PaddleOCR-main' / 'PaddleOCR-main',  # nested checkout
]

paddleocr_path = None
for path in paddleocr_paths:
    # A candidate only counts when it actually contains the `paddleocr` package.
    has_package = path.exists() and (path / 'paddleocr').exists()
    if has_package:
        paddleocr_path = path
        break

if paddleocr_path is None:
    print("[WARN] 未找到本地PaddleOCR,尝试使用pip安装的版本")
else:
    sys.path.insert(0, str(paddleocr_path))
    print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")

# PaddleOCR is mandatory: the process exits with status 1 when it cannot be imported.
try:
    from paddleocr import PaddleOCR
except ImportError as e:
    print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
    print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
    PADDLEOCR_AVAILABLE = False
    sys.exit(1)
else:
    PADDLEOCR_AVAILABLE = True
# Confidence assigned to a region when the OCR result carries no score for it.
_DEFAULT_CONFIDENCE = 0.9
# Regions whose bounding box is narrower or shorter than this many pixels are
# discarded as probable noise.
_MIN_REGION_SIDE = 10


def _to_builtin(value):
    """Recursively turn numpy arrays/scalars and tuples into plain lists/numbers.

    The result is safe to hand to ``json.dump``; values that are already plain
    Python objects are returned unchanged.
    """
    if hasattr(value, 'tolist'):
        return value.tolist()
    if isinstance(value, (list, tuple)):
        return [_to_builtin(element) for element in value]
    return value


def _first_nonempty(mapping, *keys):
    """Return the first value among *keys* that is present and non-empty, else None.

    Uses ``len`` instead of truthiness because the truth value of a numpy
    array is ambiguous and raises ``ValueError``.
    """
    for key in keys:
        value = mapping.get(key)
        if value is None:
            continue
        try:
            if len(value) > 0:
                return value
        except TypeError:
            # Not a sized container: cannot be a polygon collection.
            continue
    return None


def _iter_mapping_detections(entry):
    """Yield ``(polygon, text, confidence)`` tuples from a dict-like result.

    Two shapes are understood:

    * a page-level result holding parallel lists (``rec_polys`` or
      ``dt_polys`` plus ``rec_texts`` / ``rec_scores``), which is what the
      PaddleOCR 3.x pipeline is documented to return;
    * a single-region dict with ``dt_poly`` / ``polygon``, ``dt_score`` /
      ``confidence`` and ``text`` keys.
    """
    polygons = _first_nonempty(entry, 'rec_polys', 'dt_polys')
    if polygons is None:
        yield (
            _first_nonempty(entry, 'dt_poly', 'polygon'),
            entry.get('text', ''),
            entry.get('dt_score', entry.get('confidence', _DEFAULT_CONFIDENCE)),
        )
        return

    count = len(polygons)
    texts = _to_builtin(entry.get('rec_texts'))
    scores = _to_builtin(entry.get('rec_scores'))
    # Texts/scores are only trusted when they line up one-to-one with the
    # polygons; otherwise fall back to "no text" / the default confidence.
    if not isinstance(texts, list) or len(texts) != count:
        texts = [''] * count
    if not isinstance(scores, list) or len(scores) != count:
        scores = [_DEFAULT_CONFIDENCE] * count
    yield from zip(polygons, texts, scores)


def _iter_detections(result):
    """Yield ``(polygon, text, confidence)`` for every region in an OCR result.

    Accepts the nested-list layout ``[[[polygon, (text, score)], ...]]`` as
    well as dict-like results (see ``_iter_mapping_detections``).
    """
    if not result:
        return

    # The list layout wraps the detections of one image in an outer list.
    if isinstance(result, list) and isinstance(result[0], list):
        entries = result[0]
    else:
        entries = result
    if isinstance(entries, dict):
        entries = [entries]

    for entry in entries:
        if entry is None:
            continue
        if isinstance(entry, dict):
            yield from _iter_mapping_detections(entry)
        elif isinstance(entry, list) and entry:
            text, confidence = '', _DEFAULT_CONFIDENCE
            if len(entry) >= 2:
                info = entry[1]
                is_text_score_pair = (
                    isinstance(info, tuple)
                    or (isinstance(info, list) and info and isinstance(info[0], str))
                )
                if is_text_score_pair and len(info) >= 2:
                    text, confidence = info[0], info[1]
                elif isinstance(info, (int, float)):
                    confidence = info
            yield entry[0], text, confidence


def _build_text_block(polygon, text, confidence, order):
    """Build the JSON-ready record for one region, or None if it must be dropped.

    A region is dropped when its polygon has fewer than four points, contains
    no usable points, or its bounding box is smaller than ``_MIN_REGION_SIDE``
    in either direction.

    Raises:
        TypeError, ValueError: if a coordinate cannot be converted to float.
    """
    points = _to_builtin(polygon)
    if not isinstance(points, list) or len(points) < 4:
        return None

    corners = [p for p in points if isinstance(p, list) and len(p) >= 2]
    if not corners:
        return None
    x_coords = [float(p[0]) for p in corners]
    y_coords = [float(p[1]) for p in corners]

    x1, y1 = int(min(x_coords)), int(min(y_coords))
    x2, y2 = int(max(x_coords)), int(max(y_coords))
    width = x2 - x1
    height = y2 - y1
    if width < _MIN_REGION_SIDE or height < _MIN_REGION_SIDE:
        return None

    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    return {
        'block_index': order,
        'order': order,
        'text': text,  # recognised text, when the OCR result provides one
        'bbox': {
            'x1': x1,
            'y1': y1,
            'x2': x2,
            'y2': y2,
            'width': width,
            'height': height,
            'center_x': center_x,
            'center_y': center_y,
        },
        'polygon': points,  # exact polygon, converted to plain lists for JSON
        'confidence': round(confidence, 4),
        'center_x': round(center_x, 2),
        'center_y': round(center_y, 2),
        'width': width,
        'height': height,
        'area': width * height,
    }


def _collect_text_blocks(result, min_confidence):
    """Convert a raw OCR result into the list of text-block records.

    Regions below *min_confidence* are skipped; regions that fail to parse are
    reported with a warning and skipped.
    """
    text_blocks = []
    for idx, (polygon, text, confidence) in enumerate(_iter_detections(result)):
        try:
            confidence = float(confidence)
            if confidence < min_confidence:
                continue
            block = _build_text_block(polygon, text, confidence, len(text_blocks) + 1)
        except (TypeError, ValueError, IndexError) as e:
            print(f" [WARN] 第 {idx} 个检测结果解析失败: {e},跳过")
            continue
        if block is not None:
            text_blocks.append(block)
    return text_blocks


def _read_image_size(image_path):
    """Return ``(width, height)`` of the image at *image_path*.

    The file is read with ``np.fromfile`` and decoded from memory so that
    non-ASCII (e.g. Chinese) paths are handled.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: if the file is empty or cannot be decoded as an image.
    """
    raw = np.fromfile(str(image_path), dtype=np.uint8)
    # An empty buffer must not be handed to the decoder; report it the same
    # way as any other unreadable image.
    if raw.size == 0:
        raise ValueError(f"无法读取图片: {image_path}")
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f"无法读取图片: {image_path}")
    # Only the dimensions are needed here; OCR itself runs on the file path,
    # so no colour-space conversion is performed.
    height, width = img.shape[:2]
    return width, height


def _create_ocr_engine():
    """Create a PaddleOCR instance that uses the locally cached PP-OCRv5 models.

    Model directories are passed explicitly (under
    ``~/.paddlex/official_models``) to avoid the automatic model check and
    download.

    Raises:
        FileNotFoundError: if the detection or recognition model directory
            is missing.
    """
    model_base_dir = os.path.join(os.path.expanduser('~'), '.paddlex', 'official_models')

    text_detection_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_det')
    text_recognition_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_rec')

    if not os.path.exists(text_detection_model_dir):
        raise FileNotFoundError(f"检测模型目录不存在: {text_detection_model_dir}")
    if not os.path.exists(text_recognition_model_dir):
        raise FileNotFoundError(f"识别模型目录不存在: {text_recognition_model_dir}")

    return PaddleOCR(
        text_detection_model_dir=text_detection_model_dir,
        text_recognition_model_dir=text_recognition_model_dir,
        textline_orientation_model_dir=os.path.join(model_base_dir, 'PP-LCNet_x1_0_textline_ori'),
        doc_orientation_classify_model_dir=os.path.join(model_base_dir, 'PP-LCNet_x1_0_doc_ori'),
        doc_unwarping_model_dir=os.path.join(model_base_dir, 'UVDoc'),
        use_textline_orientation=False,  # text-line orientation is not used
        enable_mkldnn=False,  # explicitly disable MKL-DNN / oneDNN
    )


def detect_text_regions(image_path, output_dir, min_confidence=0.5):
    """Detect text regions in an image with PaddleOCR and save them as JSON.

    PaddleOCR runs detection and recognition together; the polygon of every
    region is kept, along with the recognised text and score when available.
    The result is written to ``<output_dir>/<image stem>_text_blocks.json``.

    Args:
        image_path: Path of the image to analyse.
        output_dir: Directory for the JSON file; created if missing.
        min_confidence: Regions with a lower confidence are discarded.

    Returns:
        The path of the written JSON file, as a string.

    Raises:
        FileNotFoundError: if the image or a required model directory is missing.
        ValueError: if the image is empty or cannot be decoded.
        Exception: any error raised by PaddleOCR is logged and re-raised.
    """
    image_path = Path(image_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 读取图片: {image_path.name}")
    print(f"[INFO] 图片完整路径: {image_path}")

    img_width, img_height = _read_image_size(image_path)
    print(f"[INFO] 图片尺寸: {img_width}x{img_height}")

    print("[INFO] 初始化PaddleOCR文本检测模块...")
    try:
        paddleocr_instance = _create_ocr_engine()
        print("[INFO] PaddleOCR 初始化成功(使用本地模型)")
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        raise

    print("[INFO] 正在检测文字区域...")
    try:
        result = paddleocr_instance.ocr(str(image_path))
        text_blocks = _collect_text_blocks(result, min_confidence)
        print(f"[OK] 检测到 {len(text_blocks)} 个文字区域")

        # Area statistics (bounding-box areas; overlapping boxes are counted twice).
        total_text_area = sum(block['area'] for block in text_blocks)
        total_image_area = img_width * img_height
        text_area_ratio = (total_text_area / total_image_area * 100) if total_image_area > 0 else 0

        image_name = image_path.stem
        result_data = {
            'image_file': image_name + image_path.suffix,
            'image_size': {
                'width': img_width,
                'height': img_height,
                'total_area': total_image_area,
            },
            'text_blocks': text_blocks,
            'total_count': len(text_blocks),
            'total_text_area': total_text_area,
            'text_area_ratio': round(text_area_ratio, 2),
        }

        output_json = output_dir / f"{image_name}_text_blocks.json"
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result_data, f, ensure_ascii=False, indent=2)

        print(f"[OK] 已保存检测结果: {output_json.name}")
        print(f"[INFO] 文字区域总面积: {total_text_area:.0f} 像素²")
        print(f"[INFO] 文字区域占比: {text_area_ratio:.2f}%")

        # Only text_blocks.json is written; no dialogues.json is generated.
        return str(output_json)

    except Exception as e:
        print(f"[ERROR] 文本检测失败: {e}")
        import traceback
        traceback.print_exc()
        raise
# Command-line entry point: <image path> <output dir> [min confidence].
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("用法: python paddleocr_text_detection.py <图片路径> <输出目录> [最小置信度]")
        sys.exit(1)

    # Resolve to absolute paths to avoid encoding problems.
    image_path = str(Path(cli_args[0]).resolve())
    output_dir = str(Path(cli_args[1]).resolve())
    # The confidence threshold is optional and defaults to 0.5.
    if len(cli_args) > 2:
        min_confidence = float(cli_args[2])
    else:
        min_confidence = 0.5

    try:
        detect_text_regions(image_path, output_dir, min_confidence)
    except Exception as e:
        # Report the failure and signal it to the caller through the exit status.
        print(f"[ERROR] 处理失败: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|