paddleocr_text_detection.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. # -*- coding: utf-8 -*-
  2. """
  3. 使用PaddleOCR的文本检测模块进行精确的文字区域检测
  4. 返回精确的多边形坐标和边界框信息
  5. """
  6. import sys
  7. import json
  8. import cv2
  9. import numpy as np
  10. import os
  11. from pathlib import Path
  12. # Windows编码修复
  13. if sys.platform == 'win32':
  14. import io
  15. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
  16. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
  17. # 禁用 oneDNN 以避免 NotImplementedError
  18. os.environ['FLAGS_onednn'] = '0'
  19. os.environ['FLAGS_use_mkldnn'] = '0'
  20. os.environ['FLAGS_enable_onednn_layout_fusion'] = '0'
  21. # 跳过模型源检查,加快启动速度
  22. # 注意:必须在导入 PaddleOCR 之前设置
  23. # 正确的环境变量名是 PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK
  24. os.environ['PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK'] = 'True'
  25. # 设置日志级别,减少不必要的日志输出
  26. import logging
  27. import warnings
  28. logging.getLogger('paddlex').setLevel(logging.WARNING)
  29. logging.getLogger('paddlex.inference').setLevel(logging.WARNING)
  30. logging.getLogger('paddlex.inference.utils').setLevel(logging.WARNING)
  31. logging.getLogger('paddlex.inference.utils.official_models').setLevel(logging.WARNING)
  32. # 抑制 pkg_resources 的弃用警告
  33. warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources.*')
  34. warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*pkg_resources.*')
  35. # 抑制 ccache 警告(这是 PaddlePaddle 的警告,不影响功能)
  36. warnings.filterwarnings('ignore', message='.*ccache.*')
  37. # 添加PaddleOCR路径
  38. project_root = Path(__file__).parent.parent
  39. # 尝试多个可能的路径
  40. paddleocr_paths = [
  41. project_root / 'PaddleOCR-main', # 直接路径
  42. project_root / 'PaddleOCR-main' / 'PaddleOCR-main', # 嵌套路径
  43. ]
  44. paddleocr_path = None
  45. for path in paddleocr_paths:
  46. if path.exists() and (path / 'paddleocr').exists():
  47. paddleocr_path = path
  48. break
  49. if paddleocr_path:
  50. sys.path.insert(0, str(paddleocr_path))
  51. print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")
  52. else:
  53. print(f"[WARN] 未找到本地PaddleOCR,尝试使用pip安装的版本")
  54. try:
  55. from paddleocr import PaddleOCR
  56. PADDLEOCR_AVAILABLE = True
  57. except ImportError as e:
  58. print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
  59. print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
  60. PADDLEOCR_AVAILABLE = False
  61. sys.exit(1)
  62. def detect_text_regions(image_path, output_dir, min_confidence=0.5):
  63. """
  64. 使用PaddleOCR的文本检测模块检测文字区域
  65. 参数:
  66. image_path: 图片路径
  67. output_dir: 输出目录
  68. min_confidence: 最小置信度阈值
  69. 返回:
  70. 包含精确文字区域信息的JSON文件路径
  71. """
  72. image_path = Path(image_path)
  73. output_dir = Path(output_dir)
  74. # 使用Path.mkdir处理中文路径(比os.makedirs更可靠)
  75. output_dir.mkdir(parents=True, exist_ok=True)
  76. print(f"📖 读取图片: {image_path.name}")
  77. print(f"[INFO] 图片完整路径: {image_path}")
  78. # 读取图片(处理中文路径)
  79. # 确保使用传入的原始图片路径(图1)进行检测
  80. img_array = np.fromfile(str(image_path), dtype=np.uint8)
  81. img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
  82. if img is None:
  83. raise ValueError(f"无法读取图片: {image_path}")
  84. # 确保是RGB格式
  85. if len(img.shape) == 2:
  86. img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
  87. elif img.shape[2] == 4:
  88. img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
  89. elif img.shape[2] == 3:
  90. img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  91. img_height, img_width = img.shape[:2]
  92. print(f"[INFO] 图片尺寸: {img_width}x{img_height}")
  93. # 初始化PaddleOCR(只使用文本检测模块)
  94. print("[INFO] 初始化PaddleOCR文本检测模块...")
  95. try:
  96. # 直接指定本地模型路径,避免每次检查模型
  97. # 使用PP-OCRv5_server模型(最好的中文模型)
  98. import os
  99. user_home = os.path.expanduser('~')
  100. model_base_dir = os.path.join(user_home, '.paddlex', 'official_models')
  101. text_detection_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_det')
  102. text_recognition_model_dir = os.path.join(model_base_dir, 'PP-OCRv5_server_rec')
  103. textline_orientation_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_textline_ori')
  104. doc_orientation_classify_model_dir = os.path.join(model_base_dir, 'PP-LCNet_x1_0_doc_ori')
  105. doc_unwarping_model_dir = os.path.join(model_base_dir, 'UVDoc')
  106. # 检查模型目录是否存在
  107. if not os.path.exists(text_detection_model_dir):
  108. raise FileNotFoundError(f"检测模型目录不存在: {text_detection_model_dir}")
  109. if not os.path.exists(text_recognition_model_dir):
  110. raise FileNotFoundError(f"识别模型目录不存在: {text_recognition_model_dir}")
  111. # 直接指定模型目录,避免自动下载和检查
  112. paddleocr_instance = PaddleOCR(
  113. text_detection_model_dir=text_detection_model_dir,
  114. text_recognition_model_dir=text_recognition_model_dir,
  115. textline_orientation_model_dir=textline_orientation_model_dir,
  116. doc_orientation_classify_model_dir=doc_orientation_classify_model_dir,
  117. doc_unwarping_model_dir=doc_unwarping_model_dir,
  118. use_textline_orientation=False, # 不使用文本行方向检测
  119. enable_mkldnn=False # 明确禁用 MKL-DNN/oneDNN
  120. )
  121. print("[INFO] PaddleOCR 初始化成功(使用本地模型)")
  122. except Exception as e:
  123. print(f"[ERROR] PaddleOCR初始化失败: {e}")
  124. raise
  125. # 执行文本检测
  126. print("[INFO] 正在检测文字区域...")
  127. try:
  128. # 使用ocr方法进行OCR(PaddleOCR会同时进行检测和识别)
  129. # 但我们只使用检测结果(坐标信息),忽略识别结果(文字内容)
  130. # PaddleOCR返回格式: [[[坐标点], (文字, 置信度)], ...] 或 None
  131. result = paddleocr_instance.ocr(str(image_path))
  132. text_blocks = []
  133. if result and len(result) > 0:
  134. # PaddleOCR返回格式: [[[坐标点], (文字, 置信度)], ...]
  135. # result 是一个列表,每个元素是一个检测结果
  136. detection_results = result[0] if isinstance(result, list) and len(result) > 0 and isinstance(result[0], list) else result
  137. if detection_results:
  138. for idx, item in enumerate(detection_results):
  139. if item is None:
  140. continue
  141. # 提取坐标和置信度
  142. coords = None
  143. confidence = 0.9
  144. text_content = ''
  145. if isinstance(item, list) and len(item) >= 2:
  146. # 标准格式: [[坐标点], (文字, 置信度)]
  147. coords = item[0] # 多边形坐标 [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
  148. # 第二个元素可能是元组 (文字, 置信度) 或列表
  149. if isinstance(item[1], tuple) and len(item[1]) >= 2:
  150. text_content = item[1][0] if len(item[1]) > 0 else ''
  151. confidence = float(item[1][1]) if len(item[1]) > 1 else 0.9
  152. elif isinstance(item[1], (int, float)):
  153. confidence = float(item[1])
  154. elif isinstance(item, list) and len(item) == 1:
  155. # 只有坐标,没有识别结果
  156. coords = item[0]
  157. confidence = 0.9
  158. elif isinstance(item, dict):
  159. coords = item.get('dt_poly', []) or item.get('polygon', [])
  160. confidence = float(item.get('dt_score', item.get('confidence', 0.9)))
  161. text_content = item.get('text', '')
  162. # 过滤低置信度结果
  163. if confidence < min_confidence:
  164. continue
  165. if not coords or len(coords) < 4:
  166. continue
  167. # 确保坐标格式正确
  168. try:
  169. # 计算边界框(从多边形坐标中提取)
  170. x_coords = []
  171. y_coords = []
  172. for point in coords:
  173. if isinstance(point, (list, tuple)) and len(point) >= 2:
  174. x_coords.append(float(point[0]))
  175. y_coords.append(float(point[1]))
  176. if not x_coords or not y_coords:
  177. continue
  178. x1 = int(min(x_coords))
  179. y1 = int(min(y_coords))
  180. x2 = int(max(x_coords))
  181. y2 = int(max(y_coords))
  182. width = x2 - x1
  183. height = y2 - y1
  184. # 过滤太小的区域(可能是噪点)
  185. if width < 10 or height < 10:
  186. continue
  187. area = width * height
  188. center_x = (x1 + x2) / 2
  189. center_y = (y1 + y2) / 2
  190. text_blocks.append({
  191. 'block_index': len(text_blocks) + 1,
  192. 'order': len(text_blocks) + 1,
  193. 'text': text_content, # 如果有识别结果,保存文字内容
  194. 'bbox': {
  195. 'x1': x1,
  196. 'y1': y1,
  197. 'x2': x2,
  198. 'y2': y2,
  199. 'width': width,
  200. 'height': height,
  201. 'center_x': center_x,
  202. 'center_y': center_y
  203. },
  204. 'polygon': coords, # 保存精确的多边形坐标
  205. 'confidence': round(confidence, 4),
  206. 'center_x': round(center_x, 2),
  207. 'center_y': round(center_y, 2),
  208. 'width': width,
  209. 'height': height,
  210. 'area': area
  211. })
  212. except (TypeError, ValueError, IndexError) as e:
  213. print(f" [WARN] 第 {idx} 个检测结果解析失败: {e},跳过")
  214. continue
  215. print(f"[OK] 检测到 {len(text_blocks)} 个文字区域")
  216. # 计算总面积
  217. total_text_area = sum(block['area'] for block in text_blocks)
  218. total_image_area = img_width * img_height
  219. text_area_ratio = (total_text_area / total_image_area * 100) if total_image_area > 0 else 0
  220. # 生成结果JSON
  221. image_name = image_path.stem
  222. result_data = {
  223. 'image_file': image_name + image_path.suffix,
  224. 'image_size': {
  225. 'width': img_width,
  226. 'height': img_height,
  227. 'total_area': total_image_area
  228. },
  229. 'text_blocks': text_blocks,
  230. 'total_count': len(text_blocks),
  231. 'total_text_area': total_text_area,
  232. 'text_area_ratio': round(text_area_ratio, 2)
  233. }
  234. # 保存JSON文件
  235. output_json = output_dir / f"{image_name}_text_blocks.json"
  236. with open(output_json, 'w', encoding='utf-8') as f:
  237. json.dump(result_data, f, ensure_ascii=False, indent=2)
  238. print(f"[OK] 已保存检测结果: {output_json.name}")
  239. print(f"[INFO] 文字区域总面积: {total_text_area:.0f} 像素²")
  240. print(f"[INFO] 文字区域占比: {text_area_ratio:.2f}%")
  241. # 不再生成dialogues.json文件,因为后续流程只使用text_blocks.json
  242. # dialogues.json文件中的text字段通常是空的(只做检测不做识别),没有实际用途
  243. return str(output_json)
  244. except Exception as e:
  245. print(f"[ERROR] 文本检测失败: {e}")
  246. import traceback
  247. traceback.print_exc()
  248. raise
  249. if __name__ == '__main__':
  250. if len(sys.argv) < 3:
  251. print("用法: python paddleocr_text_detection.py <图片路径> <输出目录> [最小置信度]")
  252. sys.exit(1)
  253. # 使用绝对路径避免编码问题
  254. image_path = str(Path(sys.argv[1]).resolve())
  255. output_dir = str(Path(sys.argv[2]).resolve())
  256. min_confidence = float(sys.argv[3]) if len(sys.argv) > 3 else 0.5
  257. try:
  258. detect_text_regions(image_path, output_dir, min_confidence)
  259. except Exception as e:
  260. print(f"[ERROR] 处理失败: {e}")
  261. import traceback
  262. traceback.print_exc()
  263. sys.exit(1)