| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300 |
- # -*- coding: utf-8 -*-
- """
- 使用PaddleOCR识别图片中的文字
- """
- import sys
- import json
- import cv2
- import numpy as np
- import os
- from pathlib import Path
# ========== Runtime configuration: must run before PaddlePaddle / PaddleOCR are imported ==========
# Each variable is written exactly once.  The values are the ones the earlier,
# repeated assignments ultimately left in the environment (the last write won),
# so the effective configuration is unchanged.
_PADDLE_ENV_OVERRIDES = {
    # Skip the model-source connectivity check to speed up start-up; per the
    # original author's note the models are already available locally.
    'PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK': 'True',
    # Disable oneDNN / MKL-DNN to avoid a NotImplementedError that the original
    # author attributes to a known PaddlePaddle 3.3.0 issue.
    # NOTE(review): 'OFF' is the spelling that was effectively in force before;
    # confirm Paddle parses it as false (earlier writes used '0' and 'false').
    'FLAGS_onednn': 'OFF',
    'FLAGS_use_mkldnn': 'OFF',
    'FLAGS_enable_onednn_layout_fusion': '0',
}
os.environ.update(_PADDLE_ENV_OVERRIDES)

import logging
import warnings

# Raise the paddlex loggers to WARNING to cut down on routine log output.
for _logger_name in (
    'paddlex',
    'paddlex.inference',
    'paddlex.inference.utils',
    'paddlex.inference.utils.official_models',
):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

# Silence the pkg_resources deprecation notices.
warnings.filterwarnings('ignore', category=UserWarning, message='.*pkg_resources.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, message='.*pkg_resources.*')
# Silence the ccache warning (described by the original author as harmless).
warnings.filterwarnings('ignore', message='.*ccache.*')
# ==================================================
# Windows encoding fix: re-wrap stdout/stderr as UTF-8 text streams so that the
# non-ASCII messages this script prints are encoded as UTF-8, with unencodable
# characters replaced instead of raising.
if sys.platform == 'win32':
    import io

    for _stream_name in ('stdout', 'stderr'):
        # A standard stream can be None or a replacement object that has no
        # binary ``buffer``; leave such a stream untouched rather than fail
        # with AttributeError while the module is still loading.
        _binary_stream = getattr(getattr(sys, _stream_name), 'buffer', None)
        if _binary_stream is not None:
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_binary_stream, encoding='utf-8', errors='replace'),
            )
# Prefer a PaddleOCR source checkout that sits next to the project; otherwise
# fall back to whatever ``paddleocr`` package is importable.
project_root = Path(__file__).parent.parent
# Candidate checkout locations: the plain directory first, then the nested
# directory of the same name.
paddleocr_paths = [
    project_root / 'PaddleOCR-main',
    project_root / 'PaddleOCR-main' / 'PaddleOCR-main',
]
# The first candidate that contains a ``paddleocr`` entry wins.
paddleocr_path = next(
    (candidate for candidate in paddleocr_paths if (candidate / 'paddleocr').exists()),
    None,
)
if paddleocr_path is None:
    print("[WARN] 未找到本地PaddleOCR,尝试使用pip安装的版本")
else:
    # Put the checkout at the front of sys.path so it is found first on import.
    sys.path.insert(0, str(paddleocr_path))
    print(f"[INFO] 使用本地PaddleOCR路径: {paddleocr_path}")

try:
    from paddleocr import PaddleOCR
except ImportError as import_error:
    # PaddleOCR is mandatory for this script: report the failure and exit.
    PADDLEOCR_AVAILABLE = False
    print(f"[ERROR] 无法导入PaddleOCR模块: {import_error}")
    print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
    sys.exit(1)
else:
    PADDLEOCR_AVAILABLE = True
def _load_image_rgb(image_path):
    """Read *image_path* and return the decoded picture as an RGB array.

    The file is loaded with ``np.fromfile`` and decoded with ``cv2.imdecode``
    (rather than opened by path inside OpenCV) so that paths containing
    non-ASCII characters work.

    Raises:
        ValueError: If the file is empty or cannot be decoded as an image.
        OSError: Propagated from ``np.fromfile`` when the file cannot be read.
    """
    raw_bytes = np.fromfile(str(image_path), dtype=np.uint8)
    # Report an empty file as an unreadable image instead of handing an empty
    # buffer to the decoder.
    if raw_bytes.size == 0:
        raise ValueError(f"无法读取图片: {image_path}")

    decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError(f"无法读取图片: {image_path}")

    # IMREAD_COLOR requests a 3-channel BGR image, so the grayscale and RGBA
    # cases can never occur here and only the BGR -> RGB swap is needed.
    # NOTE(review): confirm that PaddleOCR.predict expects RGB rather than
    # OpenCV-style BGR arrays; the channel order is kept exactly as before.
    return cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)


def _bbox_from_polygon(points):
    """Return the axis-aligned bounding box of *points*, or None.

    Only entries that are lists/tuples with at least two items are used as
    (x, y) points.  None is returned when fewer than four usable points remain.
    """
    xs = []
    ys = []
    for point in points:
        if isinstance(point, (list, tuple)) and len(point) >= 2:
            xs.append(point[0])
            ys.append(point[1])

    if len(xs) < 4:
        return None

    x1, y1 = int(min(xs)), int(min(ys))
    x2, y2 = int(max(xs)), int(max(ys))
    return {
        'x1': x1,
        'y1': y1,
        'x2': x2,
        'y2': y2,
        'width': x2 - x1,
        'height': y2 - y1,
        'center_x': float((x1 + x2) / 2),
        'center_y': float((y1 + y2) / 2),
    }


def _extract_dialogues(result_item):
    """Convert one PaddleOCR result object into a list of dialogue dicts.

    Reads ``result_item.json`` and, when it is a dict, its ``'res'`` entry,
    from which ``rec_texts``, ``rec_scores``, ``rec_polys`` and ``rec_boxes``
    are taken.  Blank texts and texts without usable coordinates are skipped
    with a warning.  Entries keep the order in which they appear in
    ``rec_texts`` and are numbered from 1.
    """
    result_json = result_item.json
    res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {}

    rec_texts = res_data.get('rec_texts', [])
    rec_scores = res_data.get('rec_scores', [])
    # Per the original author's notes: polygons are four [x, y] points each,
    # boxes are [x1, y1, x2, y2].
    rec_polys = res_data.get('rec_polys', [])
    rec_boxes = res_data.get('rec_boxes', [])

    print(f"[OK] 识别到 {len(rec_texts)} 个文本区域")

    dialogues = []
    for idx, text in enumerate(rec_texts):
        if not text or not text.strip():
            continue

        # Fall back to 0.9 when no score is available for this text.
        confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9

        # Prefer the polygon; otherwise expand the box into its four corners.
        if idx < len(rec_polys) and rec_polys[idx]:
            polygon = rec_polys[idx]
        elif idx < len(rec_boxes) and rec_boxes[idx]:
            box = rec_boxes[idx]
            polygon = [
                [box[0], box[1]],  # top-left
                [box[2], box[1]],  # top-right
                [box[2], box[3]],  # bottom-right
                [box[0], box[3]],  # bottom-left
            ]
        else:
            print(f" [WARN] 第 {idx} 个文本没有坐标信息,跳过")
            continue

        if not isinstance(polygon, (list, tuple)) or len(polygon) < 4:
            print(f" [WARN] 第 {idx} 个文本坐标格式不正确,跳过")
            continue

        try:
            bbox = _bbox_from_polygon(polygon)
        except (TypeError, IndexError, ValueError) as e:
            print(f" [WARN] 第 {idx} 个文本解析坐标失败: {e},跳过")
            continue

        if bbox is None:
            print(f" [WARN] 第 {idx} 个文本无法提取足够的坐标点,跳过")
            continue

        dialogues.append({
            'order': len(dialogues) + 1,
            'text': text.strip(),
            'bbox': bbox,
            'confidence': confidence,
        })
        print(f" [{len(dialogues)}/{len(rec_texts)}] {text[:50]}...")

    return dialogues


def ocr_with_paddleocr(image_path, text_mask_path, output_dir):
    """Recognise the text in an image with PaddleOCR and save it as JSON.

    Args:
        image_path: Path of the source image.
        text_mask_path: Path of a text-mask image.  Accepted only for
            interface compatibility; it is not used, so any value (empty
            string, None, str or Path) is fine.
        output_dir: Directory for the result file; created if missing.

    Returns:
        Path of the written ``<image stem>_dialogues.json`` file.

    Raises:
        ValueError: If the image is empty or cannot be decoded.
        OSError: If the image file cannot be read.
        RuntimeError: If PaddleOCR cannot be initialised.
        Exception: Whatever ``predict`` raises is logged and re-raised.
    """
    import traceback

    image_path = Path(image_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"📖 读取原始图片: {image_path.name}")
    img = _load_image_rgb(image_path)

    img_height, img_width = img.shape[:2]
    print(f"[INFO] 图片尺寸: {img_width}x{img_height}")

    print("[INFO] 初始化PaddleOCR...")
    try:
        # Plain initialisation: Chinese models, MKL-DNN/oneDNN explicitly off.
        paddleocr_instance = PaddleOCR(lang='ch', enable_mkldnn=False)
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        traceback.print_exc()
        raise RuntimeError(f"PaddleOCR 初始化失败: {e}") from e
    print("[INFO] PaddleOCR 初始化成功")

    print("[INFO] 正在识别文字...")
    print(f"[DEBUG] 图片数组信息: shape={img.shape}, dtype={img.dtype}, min={img.min()}, max={img.max()}")
    print("[DEBUG] 准备调用 paddleocr_instance.predict...")
    # Flush so the messages above are emitted before the OCR call starts.
    sys.stdout.flush()
    try:
        # Pass the decoded array, not the path, to avoid non-ASCII path issues.
        ocr_result = paddleocr_instance.predict(img)
        print(f"[DEBUG] OCR结果类型: {type(ocr_result)}, 长度: {len(ocr_result) if ocr_result else 0}")
        sys.stdout.flush()
    except Exception as e:
        print(f"[ERROR] OCR识别失败: {e}")
        traceback.print_exc()
        sys.stdout.flush()
        raise

    if not ocr_result:
        print("[WARN] 未识别到任何文字")
        dialogues = []
    else:
        try:
            # Only the first result item is used (a single image was passed in).
            dialogues = _extract_dialogues(ocr_result[0])
        except Exception as e:
            # Best effort: a result that cannot be parsed yields an empty list
            # so that a JSON file is still written.
            print(f"[ERROR] 解析PaddleOCR结果失败: {e}")
            traceback.print_exc()
            dialogues = []

    # NOTE(review): 'reading_order' is written as a fixed label; the entries
    # themselves are not sorted here and keep the order PaddleOCR returned.
    output_json = {
        'image_file': image_path.name,
        'reading_order': '从右到左、从上到下(日式漫画阅读顺序)',
        'dialogues': dialogues,
        'total_count': len(dialogues),
    }

    output_file = output_dir / f"{image_path.stem}_dialogues.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_json, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 结果已保存到: {output_file}")
    return output_file
def _parse_cli_args(argv):
    """Split *argv* into ``(image_path, text_mask_path, output_dir)``.

    ``argv[0]`` is the script name.  With exactly three entries the mask path
    is taken to be absent (an empty-string argument may have been dropped by
    the calling shell) and ``text_mask_path`` is ``""``.  With four or more
    entries the mask path is ``argv[2]`` and the output directory ``argv[3]``;
    any further entries are ignored.

    Returns:
        The three-tuple, or None when fewer than three entries were given.
    """
    if len(argv) < 3:
        return None
    if len(argv) == 3:
        return argv[1], "", argv[2]
    return argv[1], argv[2], argv[3]


def main(argv=None):
    """Command-line entry point: parse arguments, run OCR, exit 1 on failure.

    Args:
        argv: Argument list including the script name; defaults to ``sys.argv``.

    Exits with status 1 when too few arguments are given, when the run is
    interrupted with Ctrl-C, or when any other exception occurs (including a
    missing image file); the error and its traceback are printed first.
    """
    import traceback

    if argv is None:
        argv = sys.argv

    try:
        print(f"[DEBUG] sys.argv: {argv}")
        print(f"[DEBUG] sys.argv长度: {len(argv)}")

        parsed = _parse_cli_args(argv)
        if parsed is None:
            print("用法: python ocr_with_paddleocr.py <原始图片路径> [文字遮罩图路径] <输出目录>")
            sys.exit(1)
        image_path, text_mask_path, output_dir = parsed

        print(f"[DEBUG] 参数: image_path={image_path}, text_mask_path={text_mask_path}, output_dir={output_dir}")

        # Fail early with a clear message when the image does not exist.
        if not Path(image_path).exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        ocr_with_paddleocr(image_path, text_mask_path, output_dir)
    except KeyboardInterrupt:
        print("[INFO] 用户中断")
        sys.exit(1)
    except Exception as e:
        print(f"[ERROR] OCR识别失败: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()
|