# -*- coding: utf-8 -*-
"""
Detect text regions with comic-text-detector, then recognise them with PaddleOCR.

Recognised dialogue is ordered in Japanese manga reading order
(right to left, top to bottom).
"""
import sys
import os
import json
from pathlib import Path
import cv2
import numpy as np


def convert_coordinate_to_math_system(x, y, image_height):
    """
    Convert an OpenCV image coordinate to a bottom-left-origin coordinate.

    OpenCV places (0, 0) at the top-left corner with y growing downwards; the
    target system places (0, 0) at the bottom-left corner with y growing
    upwards. x is unchanged and y is mirrored: ``y_new = image_height - y``.

    Args:
        x: Original x coordinate.
        y: Original y coordinate.
        image_height: Height of the image.

    Returns:
        Tuple ``(x_new, y_new)`` in the bottom-left-origin system.
    """
    return x, image_height - y


def _make_char_box(left, top, right, bottom):
    """Build a character-box dict (corners plus centre) from corner coordinates."""
    return {
        'x1': float(left),
        'y1': float(top),
        'x2': float(right),
        'y2': float(bottom),
        'center_x': float((left + right) / 2),
        'center_y': float((top + bottom) / 2),
    }


def _box_around(center_x, center_y, width, height):
    """Build a character-box dict of the given size centred on a point."""
    return {
        'x1': float(center_x - width / 2),
        'y1': float(center_y - height / 2),
        'x2': float(center_x + width / 2),
        'y2': float(center_y + height / 2),
        'center_x': float(center_x),
        'center_y': float(center_y),
    }


def _projection_runs(projection):
    """Return ``(start, end)`` index pairs where *projection* exceeds 10% of its peak."""
    threshold = np.max(projection) * 0.1
    runs = []
    start = None
    for idx, value in enumerate(projection):
        if value > threshold:
            if start is None:
                start = idx
        elif start is not None:
            # First index below the threshold closes the current run.
            runs.append((start, idx))
            start = None
    if start is not None:
        # A run that is still open at the end extends to the last index.
        runs.append((start, len(projection)))
    return runs


def _projection_char_boxes(binary, origin_x, origin_y, is_vertical):
    """Split a binarised ROI into boxes using projection profiles.

    For a tall (vertical) ROI the image is split along columns and each slice
    is trimmed to its first/last non-empty row; for a wide ROI it is split
    along rows and each slice is trimmed to its first/last non-empty column.
    Returned coordinates are offset by ``(origin_x, origin_y)``.
    """
    boxes = []
    if is_vertical:
        for start_col, end_col in _projection_runs(np.sum(binary, axis=0)):
            rows = np.where(np.sum(binary[:, start_col:end_col], axis=1) > 0)[0]
            if len(rows) > 0:
                boxes.append(_make_char_box(
                    origin_x + start_col, origin_y + rows[0],
                    origin_x + end_col, origin_y + rows[-1],
                ))
    else:
        for start_row, end_row in _projection_runs(np.sum(binary, axis=1)):
            cols = np.where(np.sum(binary[start_row:end_row, :], axis=0) > 0)[0]
            if len(cols) > 0:
                boxes.append(_make_char_box(
                    origin_x + cols[0], origin_y + start_row,
                    origin_x + cols[-1], origin_y + end_row,
                ))
    return boxes


def _contour_char_boxes(binary, origin_x, origin_y, expected_count):
    """Find character candidates as external contours of a binarised ROI.

    Contours are kept when their area lies strictly between
    ``roi_area / (expected_count * 20)`` and ``roi_area / 2`` and their
    height/width ratio lies strictly between 0.2 and 5.0. The result is sorted
    by ``(y1, -center_x)``. ``expected_count`` must be positive.
    """
    roi_height, roi_width = binary.shape
    roi_area = roi_width * roi_height
    min_area = roi_area / (expected_count * 20)
    max_area = roi_area / 2

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for contour in contours:
        area = cv2.contourArea(contour)
        if not (min_area < area < max_area):
            continue
        x, y, w, h = cv2.boundingRect(contour)
        aspect_ratio = h / w if w > 0 else 0
        if not (0.2 < aspect_ratio < 5.0):
            continue
        box = _make_char_box(origin_x + x, origin_y + y,
                             origin_x + x + w, origin_y + y + h)
        # Record this contour's own area (previously a value leaked from the
        # filtering loop was stored on every box).
        box['area'] = area
        boxes.append(box)

    boxes.sort(key=lambda b: (b['y1'], -b['center_x']))
    return boxes


def detect_characters_with_opencv(img, text_bbox, text_content, ocr_bbox_hint=None):
    """
    Locate individual characters inside a text region with OpenCV.

    The region is binarised (adaptive threshold + small close/open), split
    with projection profiles and, alternatively, with external contours;
    whichever candidate list is closer in length to the number of non-space
    characters is kept (contours win ties). The result is then refined,
    de-overlapped and, if the count still differs, replaced by an evenly
    spaced estimate.

    Args:
        img: Source image; a 3-dimensional array is converted with
            ``cv2.COLOR_BGR2GRAY``, otherwise it is used as grayscale.
        text_bbox: Region as ``{'x1', 'y1', 'x2', 'y2'}`` in image coordinates.
        text_content: Recognised text; only ASCII spaces are ignored when
            counting characters.
        ocr_bbox_hint: Optional OCR bounding box; when truthy and the count
            does not match, ``refine_char_boxes_with_ocr_hint`` is applied.

    Returns:
        List of dicts with ``x1, y1, x2, y2, center_x, center_y`` (contour
        boxes additionally carry ``area``). Empty when the clipped region is
        empty or the text has no non-space characters.
    """
    x1 = int(text_bbox['x1'])
    y1 = int(text_bbox['y1'])
    x2 = int(text_bbox['x2'])
    y2 = int(text_bbox['y2'])

    # Clip the region to the image bounds.
    img_h, img_w = img.shape[:2]
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(img_w, x2)
    y2 = min(img_h, y2)
    if x2 <= x1 or y2 <= y1:
        return []

    text_roi = img[y1:y2, x1:x2].copy()
    if text_roi.size == 0:
        return []

    expected_char_count = len(text_content.replace(' ', ''))
    if expected_char_count == 0:
        # Nothing to locate; also avoids dividing by zero in the area filter.
        return []

    if len(text_roi.shape) == 3:
        gray_roi = cv2.cvtColor(text_roi, cv2.COLOR_BGR2GRAY)
    else:
        gray_roi = text_roi

    # Adaptive threshold copes with uneven brightness inside the region.
    binary = cv2.adaptiveThreshold(
        gray_roi, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        11, 2
    )

    # Close then open with a tiny kernel: join strokes, drop speckles.
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)

    # A region clearly taller than wide is treated as vertical text.
    roi_height, roi_width = binary.shape
    is_vertical = roi_height > roi_width * 1.2

    char_boxes = _projection_char_boxes(binary, x1, y1, is_vertical)

    contour_boxes = _contour_char_boxes(binary, x1, y1, expected_char_count)
    if contour_boxes:
        contour_error = abs(len(contour_boxes) - expected_char_count)
        projection_error = abs(len(char_boxes) - expected_char_count)
        if contour_error <= projection_error:
            char_boxes = contour_boxes

    if len(char_boxes) != expected_char_count and ocr_bbox_hint:
        char_boxes = refine_char_boxes_with_ocr_hint(
            img, text_bbox, text_content, char_boxes, ocr_bbox_hint, expected_char_count
        )

    char_boxes = remove_overlapping_boxes(char_boxes, expected_char_count)

    # Last resort: evenly spaced boxes so the count matches the text.
    if len(char_boxes) != expected_char_count:
        char_boxes = estimate_char_boxes_from_text_bbox(
            text_bbox, text_content, expected_char_count, is_vertical
        )

    return char_boxes


def _estimate_char_grid(text_bbox, expected_count, is_vertical):
    """Divide *text_bbox* evenly into *expected_count* character cells.

    Vertical layout: the column count is estimated from the aspect ratio
    (``roi_width / (roi_height / expected_count * 0.8)``, at least 1) and
    cells are filled row-major (``col = i % cols``, ``row = i // cols``).
    Horizontal layout: a single row of equally wide cells.

    Returns:
        ``(centers, char_width, char_height)`` where *centers* is a list of
        ``(x, y)`` tuples; ``([], 0.0, 0.0)`` when ``expected_count <= 0``.
    """
    if expected_count <= 0:
        return [], 0.0, 0.0

    roi_width = text_bbox['x2'] - text_bbox['x1']
    roi_height = text_bbox['y2'] - text_bbox['y1']

    if is_vertical:
        per_char = roi_height / expected_count * 0.8
        # A zero-height box gives no usable estimate; fall back to one column.
        cols = max(1, int(roi_width / per_char)) if per_char != 0 else 1
        rows = (expected_count + cols - 1) // cols
        char_width = roi_width / cols
        char_height = roi_height / rows
        centers = [
            (text_bbox['x1'] + (i % cols) * char_width + char_width / 2,
             text_bbox['y1'] + (i // cols) * char_height + char_height / 2)
            for i in range(expected_count)
        ]
    else:
        char_width = roi_width / expected_count
        char_height = roi_height
        center_y = text_bbox['y1'] + roi_height / 2
        centers = [
            (text_bbox['x1'] + i * char_width + char_width / 2, center_y)
            for i in range(expected_count)
        ]
    return centers, char_width, char_height


def refine_char_boxes_with_ocr_hint(img, text_bbox, text_content, detected_boxes, ocr_bbox_hint, expected_count):
    """
    Reconcile detected character boxes with the expected character count.

    If the detected count is within 2 of *expected_count* the input list is
    returned unchanged. Otherwise the region is divided into an even grid; for
    vertical layouts (height > 1.2 * width) each grid cell takes the nearest
    not-yet-used detected box when its Manhattan distance to the cell centre
    is below one cell width, and the estimated cell otherwise. Horizontal
    layouts use the estimated cells only.

    Args:
        img: Source image (not used by the current implementation).
        text_bbox: Region as ``{'x1', 'y1', 'x2', 'y2'}``.
        text_content: Recognised text (not used by the current implementation).
        detected_boxes: Boxes found so far.
        ocr_bbox_hint: OCR bounding box (not used by the current implementation).
        expected_count: Number of characters expected.

    Returns:
        List of at most *expected_count* character boxes.
    """
    if abs(len(detected_boxes) - expected_count) <= 2:
        return detected_boxes

    roi_width = text_bbox['x2'] - text_bbox['x1']
    roi_height = text_bbox['y2'] - text_bbox['y1']
    is_vertical = roi_height > roi_width * 1.2

    centers, char_width, char_height = _estimate_char_grid(
        text_bbox, expected_count, is_vertical
    )

    if not is_vertical:
        return [_box_around(cx, cy, char_width, char_height) for cx, cy in centers]

    refined = []
    used = set()  # indices of detected boxes already assigned to a cell
    for cx, cy in centers:
        best_idx = None
        best_dist = float('inf')
        for idx, box in enumerate(detected_boxes):
            if idx in used:
                continue
            dist = abs(box['center_x'] - cx) + abs(box['center_y'] - cy)
            if dist < best_dist:
                best_dist = dist
                best_idx = idx
        if best_idx is not None and best_dist < char_width:
            # Each detected box is consumed once so two characters never
            # share the same box.
            used.add(best_idx)
            refined.append(detected_boxes[best_idx])
        else:
            refined.append(_box_around(cx, cy, char_width, char_height))
    return refined[:expected_count]


def _box_area(box):
    """Return the area of a box dict."""
    return (box['x2'] - box['x1']) * (box['y2'] - box['y1'])


def _boxes_overlap(box_a, box_b):
    """Return True when the intersection exceeds half of the smaller box's area."""
    left = max(box_a['x1'], box_b['x1'])
    top = max(box_a['y1'], box_b['y1'])
    right = min(box_a['x2'], box_b['x2'])
    bottom = min(box_a['y2'], box_b['y2'])
    if right <= left or bottom <= top:
        return False
    overlap_area = (right - left) * (bottom - top)
    return overlap_area > min(_box_area(box_a), _box_area(box_b)) * 0.5


def remove_overlapping_boxes(char_boxes, expected_count):
    """
    Drop overlapping character boxes when there are more than expected.

    Lists with at most *expected_count* boxes are returned unchanged.
    Otherwise boxes are visited in ``(y1, center_x)`` order and a box is kept
    only if it does not overlap (more than 50% of the smaller box) any box
    already kept. If that leaves fewer than *expected_count* boxes, all boxes
    are instead grouped by proximity (Manhattan centre distance below 20
    pixels to any group member) and the largest box of each group is kept.

    Args:
        char_boxes: Character boxes.
        expected_count: Number of characters expected.

    Returns:
        At most *expected_count* boxes.
    """
    if len(char_boxes) <= expected_count:
        return char_boxes

    sorted_boxes = sorted(char_boxes, key=lambda b: (b['y1'], b['center_x']))

    kept = []
    for box in sorted_boxes:
        if not any(_boxes_overlap(box, other) for other in kept):
            kept.append(box)

    if len(kept) < expected_count:
        groups = []
        for box in sorted_boxes:
            for group in groups:
                is_near = any(
                    abs(box['center_x'] - member['center_x'])
                    + abs(box['center_y'] - member['center_y']) < 20
                    for member in group
                )
                if is_near:
                    group.append(box)
                    break
            else:
                groups.append([box])
        kept = [max(group, key=_box_area) for group in groups]

    return kept[:expected_count]


def estimate_char_boxes_from_text_bbox(text_bbox, text_content, expected_count, is_vertical):
    """
    Estimate character boxes by dividing the text box evenly.

    Used when image-based detection does not yield the expected count.

    Args:
        text_bbox: Region as ``{'x1', 'y1', 'x2', 'y2'}``.
        text_content: Recognised text (not used by the current implementation).
        expected_count: Number of boxes to produce.
        is_vertical: Whether to lay the characters out as vertical text.

    Returns:
        List of *expected_count* boxes; empty when ``expected_count <= 0``.
    """
    centers, char_width, char_height = _estimate_char_grid(
        text_bbox, expected_count, is_vertical
    )
    return [_box_around(cx, cy, char_width, char_height) for cx, cy in centers]


# Disable oneDNN to avoid a NotImplementedError (known PaddlePaddle 3.3.0 issue).
os.environ['FLAGS_onednn'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0' # Windows编码修复 if sys.platform == 'win32': import io sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') # 添加comic-text-detector路径 project_root = Path(__file__).parent.parent comic_detector_path = project_root / 'comic-text-detector-master' / 'comic-text-detector-master' sys.path.insert(0, str(comic_detector_path)) # 添加OnnxOCR本地路径(作为回退选项) onnxocr_path = project_root / 'OnnxOCR-main' / 'OnnxOCR-main' if onnxocr_path.exists(): sys.path.insert(0, str(onnxocr_path)) # 处理 wandb 可选依赖(comic-text-detector 需要但推理时不需要) try: import wandb except ImportError: # 创建一个假的 wandb 模块,避免导入错误 class FakeWandb: @staticmethod def init(*args, **kwargs): return None @staticmethod def log(*args, **kwargs): pass @staticmethod def log_model(*args, **kwargs): pass sys.modules['wandb'] = FakeWandb() try: from inference import TextDetector, REFINEMASK_ANNOTATION from utils.io_utils import imread, imwrite except ImportError as e: print(f"[ERROR] 无法导入comic-text-detector模块: {e}") print(f"[INFO] 请确保已安装依赖: pip install torch torchvision opencv-python numpy tqdm") import traceback traceback.print_exc() sys.exit(1) # PaddleOCR(唯一使用) try: # 添加PaddleOCR路径 paddleocr_path = project_root / 'PaddleOCR-main' / 'PaddleOCR-main' if paddleocr_path.exists(): sys.path.insert(0, str(paddleocr_path)) from paddleocr import PaddleOCR PADDLEOCR_AVAILABLE = True print("[INFO] PaddleOCR 可用") except ImportError as e: print(f"[ERROR] 无法导入PaddleOCR模块: {e}") print("[ERROR] PaddleOCR 是必需的,请确保已正确安装") PADDLEOCR_AVAILABLE = False # 格子识别代码已移动到 python/generate-anim/detect_panels.py # 通过导入使用 try: # 添加当前目录到路径,以便导入同目录下的模块 import sys current_dir = Path(__file__).parent if str(current_dir) not in sys.path: sys.path.insert(0, str(current_dir)) from detect_panels import detect_comic_panels, merge_panel_mask_with_text_mask except ImportError as e: print(f"[WARN] 无法导入detect_panels模块,使用本地实现: 
{e}") # 如果导入失败,使用本地实现(向后兼容) def detect_comic_panels(img): """使用opencv检测漫画格子(分镜框)- 本地实现""" if len(img.shape) == 3: gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) else: gray = img.copy() panel_mask = np.zeros_like(gray) edges = cv2.Canny(gray, 50, 150, apertureSize=3) horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)) horizontal_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, horizontal_kernel) vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40)) vertical_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, vertical_kernel) lines_mask = cv2.bitwise_or(horizontal_lines, vertical_lines) lines = cv2.HoughLinesP(lines_mask, 1, np.pi/180, threshold=100, minLineLength=50, maxLineGap=10) if lines is not None: for line in lines: x1, y1, x2, y2 = line[0] cv2.line(panel_mask, (x1, y1), (x2, y2), 255, 2) kernel = np.ones((3, 3), np.uint8) dilated = cv2.dilate(lines_mask, kernel, iterations=2) contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) panels = [] for contour in contours: x, y, w, h = cv2.boundingRect(contour) area = w * h if area > img.shape[0] * img.shape[1] * 0.01: cv2.rectangle(panel_mask, (x, y), (x + w, y + h), 255, 2) panels.append({ 'x': x, 'y': y, 'width': w, 'height': h, 'center_x': x + w / 2, 'center_y': y + h / 2 }) return panel_mask, panels def merge_panel_mask_with_text_mask(panel_mask, text_mask): """合并格子遮罩图和文字mask图""" if panel_mask.shape != text_mask.shape: panel_mask = cv2.resize(panel_mask, (text_mask.shape[1], text_mask.shape[0])) return np.maximum(panel_mask, text_mask) def get_text_block_panel(text_block, panels): """ 判断文字块属于哪个格子 参数: text_block: 文字块,包含bbox信息 panels: 格子列表 返回: panel_index: 格子索引,如果不在任何格子内返回-1 """ bbox = text_block['bbox'] center_x = (bbox['x1'] + bbox['x2']) / 2 center_y = (bbox['y1'] + bbox['y2']) / 2 for i, panel in enumerate(panels): if (panel['x'] <= center_x <= panel['x'] + panel['width'] and panel['y'] <= center_y <= panel['y'] + panel['height']): return i return -1 def 
sort_text_blocks_by_panels(text_blocks, panels, image_width, image_height): """ 按日式漫画阅读顺序排序:从右到左、从上到下(竖着读取) 排序规则: 1. 先按列分组(从右到左)- 越往右的列越靠前 2. 同一列内,按行排序(从上到下)- 越往上的行越靠前 3. 同一格子内,按X坐标从右到左 参数: text_blocks: 文字块列表,每个包含bbox信息 panels: 格子列表 image_width: 图片宽度 image_height: 图片高度 返回: 排序后的文字块列表 """ if not text_blocks: return [] # 计算每个文字块的中心点和所属格子 for block in text_blocks: bbox = block['bbox'] block['center_x'] = (bbox['x1'] + bbox['x2']) / 2 block['center_y'] = (bbox['y1'] + bbox['y2']) / 2 block['panel_index'] = get_text_block_panel(block, panels) # 排序规则(日式漫画:从右到左、从上到下竖着读取): # 1. 先按X坐标分组(从右到左)- X坐标越大(越靠右)越靠前 # 2. 同一列内,按Y坐标排序(从上到下)- Y坐标越小(越往上)越靠前 # 3. 同一位置,按X坐标从右到左 # 将图片分成列(从右到左) # 使用图片宽度的20%作为列的分组阈值(更宽松的分组) column_threshold = max(image_width * 0.2, 100) # 至少100像素 def sort_key(block): # 直接使用文字块的中心坐标,不依赖格子 center_x = block['center_x'] center_y = block['center_y'] # 计算列号(从右到左,列号越小越靠右) # 将X坐标转换为列号:X坐标越大,列号越小(越靠右) # 使用 image_width - center_x 来计算距离右边的距离 distance_from_right = image_width - center_x column = int(distance_from_right / column_threshold) # 使用列号和Y坐标作为主要排序依据 # 列号越小(越靠右)越靠前,Y坐标越小(越往上)越靠前 # 同一列同一行内,X坐标越大(越靠右)越靠前 return (column, center_y, -center_x) sorted_blocks = sorted(text_blocks, key=sort_key) return sorted_blocks def detect_and_ocr_comic(image_path, model_path=None, output_dir=None): """ 检测漫画文字区域并用OCR识别 参数: image_path: 图片路径 model_path: comic-text-detector模型路径 output_dir: 输出目录 """ image_path = Path(image_path) if not image_path.exists(): raise FileNotFoundError(f"图片文件不存在: {image_path}") print(f"📖 正在处理图片: {image_path.name}") # 设置模型路径 if model_path is None: possible_paths = [ comic_detector_path / 'data' / 'comictextdetector.pt', comic_detector_path / 'data' / 'comictextdetector.pt.onnx', ] model_path = None for path in possible_paths: if path.exists(): model_path = path break if model_path is None: raise FileNotFoundError( f"未找到comic-text-detector模型文件。请下载模型并放到以下位置之一:\n" + "\n".join([f" - {p}" for p in possible_paths]) ) # 设置输出目录 if output_dir is None: output_dir = 
image_path.parent else: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # 创建tmp子目录用于保存中间处理文件 tmp_dir = output_dir / 'tmp' tmp_dir.mkdir(parents=True, exist_ok=True) # 初始化comic-text-detector device = 'cuda' if __import__('torch').cuda.is_available() else 'cpu' print(f"[INFO] 使用设备: {device}") try: detector = TextDetector( model_path=str(model_path), input_size=1024, device=device, act='leaky' ) except Exception as e: print(f"[ERROR] 初始化检测器失败: {e}") raise # 初始化PaddleOCR(唯一使用) print("[INFO] 初始化PaddleOCR...") ocr_engine = None paddleocr_instance = None if not PADDLEOCR_AVAILABLE: raise RuntimeError("PaddleOCR 不可用,请确保已正确安装 paddlex[ocr-core]") try: # 初始化PaddleOCR,使用中文模型 # enable_mkldnn=False 禁用 MKL-DNN 以避免 NotImplementedError # use_angle_cls=True 启用角度分类器,可以更好地识别竖排文字 paddleocr_instance = PaddleOCR( use_angle_cls=True, # 启用角度分类器,支持竖排文字识别 lang='ch', # 中文 enable_mkldnn=False # 禁用 MKL-DNN 以避免 oneDNN 错误 ) ocr_engine = 'paddleocr' print("[INFO] PaddleOCR 初始化成功") except Exception as e: print(f"[ERROR] PaddleOCR初始化失败: {e}") raise RuntimeError(f"PaddleOCR 初始化失败: {e}") # 读取图片 img = imread(str(image_path)) if img is None: raise ValueError(f"无法读取图片文件: {image_path}") im_h, im_w = img.shape[:2] print(f"[INFO] 图片尺寸: {im_w}x{im_h}") image_name = image_path.stem # 步骤1: 使用comic-text-detector检测文字区域(先检测文字块,用于辅助格子检测) print("[INFO] 步骤1: 检测文字区域...") try: mask, mask_refined, blk_list = detector( img, refine_mode=REFINEMASK_ANNOTATION, keep_undetected_mask=True ) except Exception as e: print(f"[ERROR] 检测失败: {e}") raise print(f"[OK] 检测到 {len(blk_list)} 个文字区域") # 步骤2: 使用文字遮罩图和文字块信息辅助检测漫画格子 print("[INFO] 步骤2: 检测漫画格子(使用文字遮罩图和文字块信息辅助)...") # 将文字块转换为统一格式 text_blocks = [] for blk in blk_list: x1, y1, x2, y2 = blk.xyxy text_blocks.append({ 'xyxy': [int(x1), int(y1), int(x2), int(y2)] }) # 使用文字遮罩图和文字块信息检测格子(优先使用文字遮罩图) panel_mask, panels = detect_comic_panels(img, text_blocks=text_blocks, text_mask=mask_refined) print(f"[OK] 检测到 {len(panels)} 个格子") # 如果检测到的格子太少,尝试不使用辅助信息重新检测 if 
len(panels) < 4: print(f"[WARN] 检测到的格子数量较少({len(panels)}个),尝试使用传统方法重新检测...") panel_mask_fallback, panels_fallback = detect_comic_panels(img, text_blocks=None, text_mask=None) if len(panels_fallback) > len(panels): panel_mask = panel_mask_fallback panels = panels_fallback print(f"[OK] 使用传统方法检测到 {len(panels)} 个格子") # 保存格子遮罩图到tmp目录(中间文件) panel_mask_path = tmp_dir / f"{image_name}_panel_mask.png" imwrite(str(panel_mask_path), panel_mask) print(f"[OK] 已保存格子遮罩图: {panel_mask_path}") # 保存格子信息JSON到tmp目录(中间文件) panels_json = { 'image_file': image_path.name, 'panels': panels, 'total_count': len(panels) } panels_json_path = tmp_dir / f"{image_name}_panels.json" with open(panels_json_path, 'w', encoding='utf-8') as f: json.dump(panels_json, f, ensure_ascii=False, indent=2) print(f"[OK] 已保存格子信息: {panels_json_path}") # 保存原始文字遮罩图到tmp目录(中间文件) text_mask_path = tmp_dir / f"{image_name}_text_mask.png" imwrite(str(text_mask_path), mask_refined) print(f"[OK] 已保存文字遮罩图: {text_mask_path}") # 步骤3: 合并格子遮罩图和文字mask图 print("[INFO] 步骤3: 合并格子遮罩图和文字mask图...") combined_mask = merge_panel_mask_with_text_mask(panel_mask, mask_refined) # 保存合并后的mask图片到tmp目录(中间文件) combined_mask_path = tmp_dir / f"{image_name}_combined_mask.png" print(f"[INFO] 步骤4: 保存合并后的mask图片到磁盘...") imwrite(str(combined_mask_path), combined_mask) print(f"[OK] 已保存合并后的mask图片: {combined_mask_path}") # 确认文件已生成 if not text_mask_path.exists(): raise FileNotFoundError(f"文字遮罩图文件未成功生成: {text_mask_path}") print(f"[OK] 已确认文字遮罩图文件存在") # 步骤5: 从保存的mask文件中读取,裁剪每个文字区域,然后识别 print(f"[INFO] 步骤5: 从mask文件中读取并识别 {len(blk_list)} 个文字区域...") # 使用合并后的mask(已经在内存中,不需要重新读取) mask_img = combined_mask dialogues = [] for i, blk in enumerate(blk_list): x1, y1, x2, y2 = blk.xyxy # 确保坐标在图片范围内 x1 = max(0, int(x1)) y1 = max(0, int(y1)) x2 = min(im_w, int(x2)) y2 = min(im_h, int(y2)) # 从mask图片中裁剪对应的文字区域 crop_mask = mask_img[y1:y2, x1:x2] if crop_mask.size == 0: continue # 同时从原图中裁剪对应的文字区域(用于OCR识别,效果更好) crop_img = img[y1:y2, x1:x2] # 确保是RGB格式(Tesseract可以直接使用,但统一使用RGB格式) if 
len(crop_img.shape) == 2: # 如果是灰度图,转换为RGB crop_img = cv2.cvtColor(crop_img, cv2.COLOR_GRAY2RGB) elif len(crop_img.shape) == 3 and crop_img.shape[2] == 4: # 如果是RGBA,转换为RGB crop_img = cv2.cvtColor(crop_img, cv2.COLOR_RGBA2RGB) # 对图片进行预处理以提高OCR识别率(保守处理,避免过度处理) # 1. 转换为灰度图 if len(crop_img.shape) == 3: gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY) else: gray = crop_img # 2. 检测是否为黑底白字(黑白漫画) # 计算图片的平均亮度 mean_brightness = np.mean(gray) is_dark_background = mean_brightness < 127 # 如果平均亮度小于127,可能是黑底 # 如果是黑底白字,先反转颜色(OCR模型通常训练在白底黑字上) if is_dark_background: gray = cv2.bitwise_not(gray) # 3. 适度放大图片(仅对很小的文字区域) h, w = gray.shape[:2] if h < 32 or w < 32: # 只有很小的文字区域才放大 scale = 2.0 new_h, new_w = int(h * scale), int(w * scale) gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_CUBIC) # 4. 增强对比度(使用CLAHE,保守设置) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) enhanced = clahe.apply(gray) # 5. 轻度去噪处理(避免过度模糊) enhanced = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21) # 6. 转换回RGB格式(Tesseract可以直接使用灰度图,但RGB也可以) if len(enhanced.shape) == 2: crop_img_processed = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB) else: crop_img_processed = enhanced text_block = { 'index': i + 1, 'bbox': { 'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2, 'width': x2 - x1, 'height': y2 - y1, 'center_x': (x1 + x2) / 2, 'center_y': (y1 + y2) / 2 } } try: if ocr_engine == 'paddleocr': # 使用PaddleOCR识别 try: # PaddleOCR返回格式: [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], (text, confidence), ...] 
ocr_result = paddleocr_instance.ocr(crop_img_processed) # 调试:打印OCR结果格式 if ocr_result: print(f" [DEBUG] 第 {i+1} 个区域: ocr_result类型={type(ocr_result)}, 长度={len(ocr_result) if isinstance(ocr_result, (list, tuple)) else 'N/A'}") if len(ocr_result) > 0: result_item = ocr_result[0] print(f" [DEBUG] 第 {i+1} 个区域: ocr_result[0]类型={type(result_item)}") # 检查OCRResult对象的属性 if hasattr(result_item, '__dict__'): print(f" [DEBUG] 第 {i+1} 个区域: OCRResult属性={list(result_item.__dict__.keys())}") # 尝试转换为列表或字典 try: if hasattr(result_item, 'text_lines') or hasattr(result_item, 'texts'): print(f" [DEBUG] 第 {i+1} 个区域: 尝试访问text_lines或texts属性") except: pass if ocr_result and len(ocr_result) > 0: # PaddleOCR 3.x 返回的是 OCRResult 对象 result_item = ocr_result[0] # OCRResult 对象有 json 属性,返回字典格式 # 结构: {'res': {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...], 'rec_boxes': [...]}} try: result_json = result_item.json res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {} # 提取文本、置信度、坐标 rec_texts = res_data.get('rec_texts', []) rec_scores = res_data.get('rec_scores', []) rec_polys = res_data.get('rec_polys', []) # 多边形坐标 [[[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ...] rec_boxes = res_data.get('rec_boxes', []) # 边界框 [[x1,y1,x2,y2], ...] 
if not rec_texts: print(f" [DEBUG] 第 {i+1} 个区域: PaddleOCR未识别到文字") continue text_lines_with_bbox = [] all_texts = [] all_char_boxes_list = [] # 关键改进:先收集所有文本行和它们的边界框,然后对整个文本区域进行字符检测 # 这样可以确保OpenCV检测到所有字符,而不仅仅是单个文本行的字符 # 解析PaddleOCR结果,收集所有文本行 all_text_lines = [] # 存储所有文本行及其边界框 for idx, text in enumerate(rec_texts): if not text or not text.strip(): continue # 获取置信度 confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9 # 获取坐标(优先使用多边形坐标,如果没有则使用边界框) if idx < len(rec_polys) and rec_polys[idx]: bbox_coords = rec_polys[idx] # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] elif idx < len(rec_boxes) and rec_boxes[idx]: # 将边界框转换为多边形格式 box = rec_boxes[idx] # [x1, y1, x2, y2] bbox_coords = [ [box[0], box[1]], # 左上 [box[2], box[1]], # 右上 [box[2], box[3]], # 右下 [box[0], box[3]] # 左下 ] else: print(f" [DEBUG] 第 {i+1} 个区域: 第 {idx} 个文本没有坐标信息") continue if text and text.strip(): # 计算边界框 # 检查 bbox_coords 格式 if not isinstance(bbox_coords, (list, tuple)) or len(bbox_coords) < 4: print(f" [DEBUG] 第 {i+1} 个区域: bbox_coords 格式不正确: {type(bbox_coords)}, {bbox_coords}") continue # 检查每个坐标点格式 try: x_coords = [] y_coords = [] for coord in bbox_coords: if isinstance(coord, (list, tuple)) and len(coord) >= 2: x_coords.append(coord[0]) y_coords.append(coord[1]) else: print(f" [DEBUG] 第 {i+1} 个区域: 坐标点格式不正确: {coord}") break if not x_coords or not y_coords or len(x_coords) < 4: print(f" [DEBUG] 第 {i+1} 个区域: 无法提取足够的坐标点") continue except (TypeError, IndexError) as e: print(f" [DEBUG] 第 {i+1} 个区域: 解析坐标失败: {e}, bbox_coords={bbox_coords}") continue left = min(x_coords) top = min(y_coords) right = max(x_coords) bottom = max(y_coords) # 转换为绝对坐标(相对于原图) char_bbox = { 'x1': float(x1 + left), 'y1': float(y1 + top), 'x2': float(x1 + right), 'y2': float(y1 + bottom), 'center_x': float(x1 + (left + right) / 2), 'center_y': float(y1 + (top + bottom) / 2) } text_lines_with_bbox.append({ 'text': text, 'bbox': char_bbox, 'confidence': confidence }) all_texts.append((text, confidence)) # 收集文本行信息,稍后统一处理 
all_text_lines.append({ 'text': text, 'bbox': char_bbox, 'confidence': confidence }) # 关键改进:对所有文本行合并后的整个区域进行字符检测 if all_text_lines: # 计算整个文本区域的边界框(包含所有文本行) all_x1 = [line['bbox']['x1'] for line in all_text_lines] all_y1 = [line['bbox']['y1'] for line in all_text_lines] all_x2 = [line['bbox']['x2'] for line in all_text_lines] all_y2 = [line['bbox']['y2'] for line in all_text_lines] combined_bbox = { 'x1': float(min(all_x1)), 'y1': float(min(all_y1)), 'x2': float(max(all_x2)), 'y2': float(max(all_y2)), 'center_x': float((min(all_x1) + max(all_x2)) / 2), 'center_y': float((min(all_y1) + max(all_y2)) / 2) } # 合并所有文本行的文本 combined_text_for_detection = ''.join([line['text'] for line in all_text_lines]) # 使用OpenCV检测整个文本区域的所有字符 # 注意:text_bbox_for_detection必须使用绝对坐标(相对于原图) # 因为detect_characters_with_opencv函数期望的是原图坐标 text_bbox_for_detection = { 'x1': combined_bbox['x1'], 'y1': combined_bbox['y1'], 'x2': combined_bbox['x2'], 'y2': combined_bbox['y2'] } # 使用OpenCV检测字符位置(需要传入原图img,而不是crop_img) # 注意:坐标是相对于原图的,所以需要传入原图 # 传入OCR的边界框作为参考,提高识别率 detected_char_boxes = detect_characters_with_opencv( img, text_bbox_for_detection, combined_text_for_detection, ocr_bbox_hint=combined_bbox ) # 调试输出:检查OpenCV是否识别出所有字符 if '远道' in combined_text_for_detection or '石田' in combined_text_for_detection: print(f" [DEBUG] 合并后OCR文本: {combined_text_for_detection}") text_no_space_debug = combined_text_for_detection.replace(' ', '') print(f" [DEBUG] 去除空格后: {text_no_space_debug}, 字符数: {len(text_no_space_debug)}") print(f" [DEBUG] OpenCV检测到的字符框数: {len(detected_char_boxes)}") if len(detected_char_boxes) > 0: print(f" [DEBUG] 前3个字符框位置: center_x={[b['center_x'] for b in detected_char_boxes[:3]]}, center_y={[b['center_y'] for b in detected_char_boxes[:3]]}") text_no_space = combined_text_for_detection.replace(' ', '') if len(detected_char_boxes) > 0 and len(detected_char_boxes) == len(text_no_space): # 使用OpenCV检测到的精确位置 # 关键:OpenCV检测的字符框顺序可能与OCR文本顺序不一致 # 需要根据字符框的位置来匹配字符,而不是简单地按索引对应 # 
方法1:将字符框按位置排序(在OpenCV坐标系中:从上到下、从右到左) # 注意:detect_characters_with_opencv函数返回的字符框可能已经按某种顺序排列 # 但我们需要确保按照正确的阅读顺序(从上到下、从右到左)排序 sorted_char_boxes = sorted(detected_char_boxes, key=lambda b: (b['y1'], -b['center_x'])) # 反转文本字符,使其与字符框的位置顺序对应 reversed_text_chars = list(text_no_space[::-1]) # 将排序后的字符框与反转后的文本字符对应 for k, char_box in enumerate(sorted_char_boxes): char = reversed_text_chars[k] if k < len(reversed_text_chars) else '?' all_char_boxes_list.append({ 'char': char, 'x1': char_box['x1'], 'y1': char_box['y1'], 'x2': char_box['x2'], 'y2': char_box['y2'], 'center_x': char_box['center_x'], 'center_y': char_box['center_y'] }) else: # 如果OpenCV检测失败,回退到估算方法 if len(text_no_space) > 0: bbox_width = right - left bbox_height = bottom - top is_vertical = bbox_height > bbox_width * 1.2 if is_vertical: # 竖排:字符从上到下(y坐标从小到大) char_height = bbox_height / len(text_no_space) for k, char in enumerate(text_no_space): char_x = char_bbox['center_x'] char_y = char_bbox['y1'] + char_height * (k + 0.5) all_char_boxes_list.append({ 'char': char, 'x1': char_x - 5, 'y1': char_y - char_height/2, 'x2': char_x + 5, 'y2': char_y + char_height/2, 'center_x': char_x, 'center_y': char_y }) else: # 横排:字符从左到右(估算) char_width = bbox_width / len(text_no_space) for k, char in enumerate(text_no_space): char_x = char_bbox['x1'] + char_width * (k + 0.5) char_y = char_bbox['center_y'] all_char_boxes_list.append({ 'char': char, 'x1': char_x - char_width/2, 'y1': char_y - 5, 'x2': char_x + char_width/2, 'y2': char_y + 5, 'center_x': char_x, 'center_y': char_y }) # 合并所有文字 if all_texts: # 先对文字行进行排序(从右到左、从上到下) # 注意:对于日式漫画,阅读顺序是从右到左、从上到下 # 排序规则:先按Y坐标从上到下(y1越小越靠上),然后按X坐标从右到左(center_x越大越靠右) if len(text_lines_with_bbox) > 1: text_lines_with_bbox.sort(key=lambda line: (line['bbox']['y1'], -line['bbox']['center_x'])) # 从排序后的text_lines_with_bbox中提取文本 text_lines = [line['text'] for line in text_lines_with_bbox] combined_text = ' '.join(text_lines) avg_confidence = sum([t[1] for t in all_texts]) / len(all_texts) if all_texts 
else 0.0 # 使用字符位置信息 character_positions = [] if all_char_boxes_list and len(all_char_boxes_list) > 0: # 获取图片高度(用于坐标转换) img_height = img.shape[0] # 注意:字符框已经在前面按位置排序并与文本字符对应了 # 这里不需要再次排序,保持字符与坐标的对应关系 # 直接使用all_char_boxes_list,保持字符与坐标的对应关系 for char_box in all_char_boxes_list: # 将坐标转换为数学坐标系(左下角为原点,向上为y轴正方向) # 转换中心坐标 center_x_old = char_box['center_x'] center_y_old = char_box['center_y'] center_x_new, center_y_new = convert_coordinate_to_math_system(center_x_old, center_y_old, img_height) # 转换边界框坐标(用于更精确的位置信息) x1_old = char_box['x1'] y1_old = char_box['y1'] x2_old = char_box['x2'] y2_old = char_box['y2'] x1_new, y1_new = convert_coordinate_to_math_system(x1_old, y1_old, img_height) x2_new, y2_new = convert_coordinate_to_math_system(x2_old, y2_old, img_height) # 注意:在数学坐标系中,y1_new > y2_new(因为y1在原图中更靠上,转换后y值更大) # 所以需要确保y1是上边界(y值更大),y2是下边界(y值更小) y1_math = max(y1_new, y2_new) # 上边界(y值更大) y2_math = min(y1_new, y2_new) # 下边界(y值更小) character_positions.append({ 'x': center_x_new, # 转换后的中心x坐标(数学坐标系) 'y': center_y_new, # 转换后的中心y坐标(数学坐标系) 'center_x': center_x_new, # 转换后的中心x坐标 'center_y': center_y_new, # 转换后的中心y坐标 'x1': min(x1_new, x2_new), # 转换后的左边界x坐标 'y1': y1_math, # 转换后的上边界y坐标(数学坐标系中y值更大) 'x2': max(x1_new, x2_new), # 转换后的右边界x坐标 'y2': y2_math, # 转换后的下边界y坐标(数学坐标系中y值更小) 'x_old': center_x_old, # 保留原始中心x坐标(用于调试) 'y_old': center_y_old # 保留原始中心y坐标(用于调试) }) # 如果字符位置数量不匹配,清空 text_no_space_for_check = combined_text.replace(' ', '') if len(character_positions) != len(text_no_space_for_check): if '远道' in combined_text or '石田' in combined_text: print(f" [DEBUG] 字符位置数量不匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}, text=\"{combined_text}\"") character_positions = [] elif '远道' in combined_text or '石田' in combined_text: print(f" [DEBUG] 字符位置数量匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}") print(f" [DEBUG] 前3个character_positions: {[{'x': p.get('center_x', p.get('x', 0)), 'y': p.get('center_y', p.get('y', 
0))} for p in character_positions[:3]]}") # 调试输出:检查character_positions if ('远道' in combined_text or '石田' in combined_text) and character_positions: print(f" [DEBUG] 保存到dialogues: text=\"{combined_text}\", character_positions数量={len(character_positions)}") if combined_text and combined_text.strip(): dialogues.append({ 'order': i + 1, 'text': combined_text, 'bbox': text_block['bbox'], 'confidence': avg_confidence, 'character_positions': character_positions if character_positions else None }) text_preview = combined_text[:30] + '...' if len(combined_text) > 30 else combined_text print(f" [{i+1}/{len(blk_list)}] 识别: {text_preview} (置信度: {avg_confidence:.2f})") else: print(f" [DEBUG] 第 {i+1} 个区域: combined_text为空 (all_texts长度: {len(all_texts)})") else: print(f" [DEBUG] 第 {i+1} 个区域未识别到文字 (all_texts为空)") except Exception as e: print(f" [WARN] PaddleOCR解析第 {i+1} 个区域结果失败: {e}") import traceback traceback.print_exc() continue except Exception as e: print(f" [WARN] PaddleOCR识别第 {i+1} 个区域失败: {e}") import traceback traceback.print_exc() continue else: # 只使用PaddleOCR,如果失败则报错 raise RuntimeError(f"OCR引擎不是PaddleOCR,当前引擎: {ocr_engine}") except Exception as e: print(f" [WARN] 识别第 {i+1} 个区域失败: {e}") import traceback traceback.print_exc() continue print(f"[OK] 成功识别 {len(dialogues)} 段文字") # 步骤6: 按格子位置排序(越往上、越往右的格子里的对话顺序越靠前) print("[INFO] 步骤6: 按格子位置排序...") sorted_dialogues = sort_text_blocks_by_panels(dialogues, panels, im_w, im_h) # 重新分配order,保留order、text、bbox和character_positions字段 formatted_dialogues = [] for i, dialogue in enumerate(sorted_dialogues, 1): formatted_dialogues.append({ 'order': i, 'text': dialogue['text'], 'bbox': dialogue.get('bbox', {}), # 保留bbox信息用于排序 'character_positions': dialogue.get('character_positions') # 保留字符位置信息用于字符排序 }) # 步骤7: 保存JSON结果到output_dir(ocr目录,最终结果) print("[INFO] 步骤7: 保存JSON结果...") result = { 'image_file': image_path.name, 'reading_order': '从右到左、从上到下(日式漫画阅读顺序)', 'dialogues': formatted_dialogues, 'total_count': len(formatted_dialogues) } # 
def batch_detect_and_ocr(image_dir, model_path=None, output_dir=None):
    """
    Run text detection + OCR on every image directly inside a directory.

    Files are selected by extension (jpg, jpeg, png, bmp, webp), compared
    case-insensitively, and processed in ascending order of the integer
    prefix that precedes the first underscore of the file stem. Files
    without such a prefix sort as 0; ties are broken by file name so the
    processing order is deterministic.

    Args:
        image_dir: Directory that contains the images.
        model_path: Forwarded unchanged to ``detect_and_ocr_comic``.
        output_dir: Directory forwarded to ``detect_and_ocr_comic``.
            Defaults to ``<image_dir>/ocr``. Created if it does not exist.

    Returns:
        A list with the value returned by ``detect_and_ocr_comic`` for each
        image that was processed without raising. Images that fail are
        reported on stdout and skipped.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist.
    """
    image_dir = Path(image_dir)
    if not image_dir.exists():
        raise FileNotFoundError(f"图片目录不存在: {image_dir}")

    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

    def page_sort_key(path):
        prefix = path.stem.split('_')[0]
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits and would crash int().
        page_number = int(prefix) if prefix.isdecimal() else 0
        # The file name is a tie-breaker so equal prefixes sort reproducibly.
        return page_number, path.name

    # A single pass with a case-insensitive suffix test. Globbing every
    # pattern in lower AND upper case lists each file twice on
    # case-insensitive filesystems and misses mixed-case names such as
    # "page.Jpg" on case-sensitive ones.
    image_files = sorted(
        (
            path for path in image_dir.glob('*')
            if path.is_file() and path.name.lower().endswith(image_extensions)
        ),
        key=page_sort_key,
    )

    print(f"[INFO] 找到 {len(image_files)} 张图片")

    # Resolve and create the output directory.
    if output_dir is None:
        output_dir = image_dir / 'ocr'
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    for i, image_file in enumerate(image_files, 1):
        print(f"\n[{i}/{len(image_files)}] 处理: {image_file.name}")
        try:
            result = detect_and_ocr_comic(image_file, model_path, output_dir)
            results.append(result)
        except Exception as e:
            # Best effort: one broken page must not abort the whole batch.
            print(f"[ERROR] 处理 {image_file.name} 失败: {e}")
            import traceback
            traceback.print_exc()

    print(f"\n[OK] 批量处理完成,成功处理 {len(results)} 张图片")

    return results


if __name__ == '__main__':
    import argparse

    # Command-line entry point: a single image file or a directory of images.
    parser = argparse.ArgumentParser(description='检测漫画文字区域并用OCR识别')
    parser.add_argument('input', help='输入图片路径或目录')
    parser.add_argument('-o', '--output', help='输出目录')
    parser.add_argument('-m', '--model', help='comic-text-detector模型路径')

    args = parser.parse_args()

    input_path = Path(args.input)
    if input_path.is_file():
        detect_and_ocr_comic(input_path, args.model, args.output)
    elif input_path.is_dir():
        batch_detect_and_ocr(input_path, args.model, args.output)
    else:
        print(f"[ERROR] 输入路径不存在: {input_path}")
        sys.exit(1)