# Source repository: yichael/AIStoryBoard
							# -*- coding: utf-8 -*-
"""
使用 comic-text-detector 检测文字区域，然后用 PaddleOCR 识别文字内容
按日式漫画阅读顺序（从右到左、从上到下）排序
"""

import sys
import os
import json
from pathlib import Path
import cv2
import numpy as np


def convert_coordinate_to_math_system(x, y, image_height):
    """
    Map a point from OpenCV image coordinates to math-style coordinates.

    OpenCV places the origin at the top-left corner with y growing downward.
    The target system places the origin at the bottom-left corner with y
    growing upward. The x axis is identical in both systems.

    Args:
        x: x coordinate in the OpenCV system (returned unchanged).
        y: y coordinate in the OpenCV system.
        image_height: Height of the image.

    Returns:
        Tuple ``(x, image_height - y)``.
    """
    flipped_y = image_height - y
    return (x, flipped_y)


def detect_characters_with_opencv(img, text_bbox, text_content, ocr_bbox_hint=None):
    """
    Locate individual characters inside a text region using OpenCV.

    The region is binarised, segmented with projection profiles, and
    cross-checked against external contours; whichever result is closer to
    the number of characters in ``text_content`` is kept. If the count still
    does not match, the boxes are refined with the OCR hint, de-overlapped
    and, as a last resort, replaced by an evenly spaced estimate.

    Args:
        img: Source image (BGR, or an already single-channel image).
        text_bbox: Text bounding box ``{'x1', 'y1', 'x2', 'y2'}`` in image
            pixels. It is clipped to the image before use.
        text_content: Recognised text. ASCII spaces are ignored when counting
            the expected number of characters.
        ocr_bbox_hint: Optional OCR-provided bounding box. When truthy and the
            detected count is off, ``refine_char_boxes_with_ocr_hint`` is
            applied.

    Returns:
        List of character boxes, each a dict with ``'x1'``, ``'y1'``,
        ``'x2'``, ``'y2'``, ``'center_x'`` and ``'center_y'``. Boxes that come
        from contour detection additionally carry ``'area'`` (the contour's
        own area). Returns ``[]`` when the text has no non-space characters
        or the clipped region is empty.
    """
    # Spaces are not drawn glyphs, so they do not count as characters.
    expected_char_count = len(text_content.replace(' ', ''))
    if expected_char_count == 0:
        # Nothing to locate. This also keeps the minimum-area formula below
        # from dividing by zero.
        return []

    # Clip the text box to the image.
    img_h, img_w = img.shape[:2]
    x1 = max(0, int(text_bbox['x1']))
    y1 = max(0, int(text_bbox['y1']))
    x2 = min(img_w, int(text_bbox['x2']))
    y2 = min(img_h, int(text_bbox['y2']))

    if x2 <= x1 or y2 <= y1:
        return []

    text_roi = img[y1:y2, x1:x2].copy()
    if text_roi.size == 0:
        return []

    if len(text_roi.shape) == 3:
        gray_roi = cv2.cvtColor(text_roi, cv2.COLOR_BGR2GRAY)
    else:
        gray_roi = text_roi

    # Adaptive threshold: lighting may vary across the text region.
    binary = cv2.adaptiveThreshold(
        gray_roi, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 11, 2
    )

    # Close then open with a small kernel: join strokes, drop speckle noise.
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)

    # A region noticeably taller than wide is treated as vertical text.
    roi_height, roi_width = binary.shape
    is_vertical = roi_height > roi_width * 1.2

    def _runs_above(projection, cutoff):
        """Return (start, end) index pairs of consecutive entries > cutoff (end exclusive)."""
        runs = []
        run_start = None
        for idx, value in enumerate(projection):
            if value > cutoff:
                if run_start is None:
                    run_start = idx
            elif run_start is not None:
                runs.append((run_start, idx))
                run_start = None
        if run_start is not None:
            # The last run touches the edge of the region.
            runs.append((run_start, len(projection)))
        return runs

    def _box(bx1, by1, bx2, by2):
        """Build a box dict (plain floats) from corner coordinates."""
        return {
            'x1': float(bx1),
            'y1': float(by1),
            'x2': float(bx2),
            'y2': float(by2),
            'center_x': float((bx1 + bx2) / 2),
            'center_y': float((by1 + by2) / 2)
        }

    char_boxes = []

    # NOTE(review): for vertical text the runs below are found along the
    # column axis (and along the row axis for horizontal text), so a run may
    # correspond to a whole text column/line rather than one glyph. Behaviour
    # is kept as-is; the contour pass below usually takes precedence.
    if is_vertical:
        column_profile = np.sum(binary, axis=0)
        cutoff = np.max(column_profile) * 0.1
        for start_col, end_col in _runs_above(column_profile, cutoff):
            row_profile = np.sum(binary[:, start_col:end_col], axis=1)
            ink_rows = np.where(row_profile > 0)[0]
            if len(ink_rows) == 0:
                continue
            char_boxes.append(_box(
                x1 + start_col, y1 + ink_rows[0],
                x1 + end_col, y1 + ink_rows[-1]
            ))
    else:
        row_profile = np.sum(binary, axis=1)
        cutoff = np.max(row_profile) * 0.1
        for start_row, end_row in _runs_above(row_profile, cutoff):
            column_profile = np.sum(binary[start_row:end_row, :], axis=0)
            ink_cols = np.where(column_profile > 0)[0]
            if len(ink_cols) == 0:
                continue
            char_boxes.append(_box(
                x1 + ink_cols[0], y1 + start_row,
                x1 + ink_cols[-1], y1 + end_row
            ))

    # Contour pass: external contours filtered by area and aspect ratio.
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    roi_area = roi_width * roi_height
    min_area = roi_area / (expected_char_count * 20)  # below this: noise
    max_area = roi_area / 2  # above this: not a single character

    contour_boxes = []
    for contour in contours:
        area = cv2.contourArea(contour)
        if not (min_area < area < max_area):
            continue
        cx, cy, cw, ch = cv2.boundingRect(contour)
        aspect_ratio = ch / cw if cw > 0 else 0
        if not (0.2 < aspect_ratio < 5.0):
            continue

        # Convert to full-image coordinates.
        bx1 = float(x1 + cx)
        by1 = float(y1 + cy)
        contour_boxes.append({
            'x1': bx1,
            'y1': by1,
            'x2': float(x1 + cx + cw),
            'y2': float(y1 + cy + ch),
            'center_x': float(bx1 + cw / 2),
            'center_y': float(by1 + ch / 2),
            # Area of this contour (previously a stale value left over from
            # the filter loop was stored for every box).
            'area': area
        })

    if contour_boxes:
        # Top-to-bottom, then right-to-left.
        contour_boxes.sort(key=lambda b: (b['y1'], -b['center_x']))

        # Prefer contours when they are at least as close to the expected count.
        if abs(len(contour_boxes) - expected_char_count) <= abs(len(char_boxes) - expected_char_count):
            char_boxes = contour_boxes

    # Count mismatch: let the OCR hint refine the boxes.
    if len(char_boxes) != expected_char_count and ocr_bbox_hint:
        char_boxes = refine_char_boxes_with_ocr_hint(
            img, text_bbox, text_content, char_boxes, ocr_bbox_hint, expected_char_count
        )

    char_boxes = remove_overlapping_boxes(char_boxes, expected_char_count)

    # Still mismatched: fall back to an evenly spaced estimate.
    if len(char_boxes) != expected_char_count:
        char_boxes = estimate_char_boxes_from_text_bbox(
            text_bbox, text_content, expected_char_count, is_vertical
        )

    return char_boxes


def refine_char_boxes_with_ocr_hint(img, text_bbox, text_content, detected_boxes, ocr_bbox_hint, expected_count):
    """
    Rebuild character boxes on a regular grid when detection is far off.

    If the number of detected boxes is within 2 of ``expected_count`` the
    detected list is returned untouched. Otherwise the text box is divided
    into ``expected_count`` cells. For vertical layouts, a detected box whose
    centre lies close to a cell centre replaces that cell; each detected box
    is used for at most one cell.

    Args:
        img: Source image (accepted for interface compatibility; not used).
        text_bbox: Text bounding box ``{'x1', 'y1', 'x2', 'y2'}``.
        text_content: Recognised text (not used; the count comes from
            ``expected_count``).
        detected_boxes: Character boxes found so far; each needs
            ``'center_x'`` and ``'center_y'``.
        ocr_bbox_hint: OCR-provided bounding box (not used here).
        expected_count: Number of characters the result should contain.

    Returns:
        ``detected_boxes`` itself when its length is within 2 of
        ``expected_count``; ``[]`` when ``expected_count`` is not positive;
        otherwise a new list of exactly ``expected_count`` boxes.
    """
    # Close enough already: keep what was detected.
    if abs(len(detected_boxes) - expected_count) <= 2:
        return detected_boxes

    # No characters expected: nothing to lay out (and avoids dividing by zero).
    if expected_count <= 0:
        return []

    left = text_bbox['x1']
    top = text_bbox['y1']
    roi_width = text_bbox['x2'] - left
    roi_height = text_bbox['y2'] - top
    is_vertical = roi_height > roi_width * 1.2

    def _cell(center_x, center_y, width, height):
        """Build a box dict of the given size centred on (center_x, center_y)."""
        return {
            'x1': float(center_x - width / 2),
            'y1': float(center_y - height / 2),
            'x2': float(center_x + width / 2),
            'y2': float(center_y + height / 2),
            'center_x': float(center_x),
            'center_y': float(center_y)
        }

    if not is_vertical:
        # Horizontal: one row, equal-width cells spanning the full height.
        char_width = roi_width / expected_count
        center_y = top + roi_height / 2
        return [
            _cell(left + i * char_width + char_width / 2, center_y, char_width, roi_height)
            for i in range(expected_count)
        ]

    # Vertical: estimate a grid. A glyph is assumed to be roughly 0.8x as
    # wide as the per-character share of the height.
    approx_char_width = roi_height / expected_count * 0.8
    if approx_char_width:
        estimated_cols = max(1, int(roi_width / approx_char_width))
    else:
        # Degenerate (zero-height) box: fall back to a single column.
        estimated_cols = 1
    estimated_rows = (expected_count + estimated_cols - 1) // estimated_cols

    char_width = roi_width / estimated_cols
    char_height = roi_height / estimated_rows

    # Detected boxes not yet assigned to a cell; claiming removes them so no
    # box is returned twice.
    unclaimed = list(detected_boxes)
    refined = []

    for i in range(expected_count):
        col = i % estimated_cols
        row = i // estimated_cols
        est_x = left + col * char_width + char_width / 2
        est_y = top + row * char_height + char_height / 2

        # Nearest unclaimed detected box by Manhattan distance between centres.
        best_idx = None
        best_dist = float('inf')
        for idx, box in enumerate(unclaimed):
            dist = abs(box['center_x'] - est_x) + abs(box['center_y'] - est_y)
            if dist < best_dist:
                best_dist = dist
                best_idx = idx

        if best_idx is not None and best_dist < char_width:
            refined.append(unclaimed.pop(best_idx))
        else:
            refined.append(_cell(est_x, est_y, char_width, char_height))

    return refined


def remove_overlapping_boxes(char_boxes, expected_count):
    """
    Drop character boxes that overlap ones already kept.

    Args:
        char_boxes: List of box dicts with ``'x1'``, ``'y1'``, ``'x2'``,
            ``'y2'``, ``'center_x'`` and ``'center_y'``.
        expected_count: Number of characters expected.

    Returns:
        ``char_boxes`` itself when it has no more than ``expected_count``
        entries; otherwise a filtered list truncated to ``expected_count``.
    """
    if len(char_boxes) <= expected_count:
        return char_boxes

    def _area(b):
        return (b['x2'] - b['x1']) * (b['y2'] - b['y1'])

    def _clashes(a, b):
        # Two boxes clash when their intersection covers more than half of
        # the smaller one.
        ix1 = max(a['x1'], b['x1'])
        iy1 = max(a['y1'], b['y1'])
        ix2 = min(a['x2'], b['x2'])
        iy2 = min(a['y2'], b['y2'])
        if ix2 <= ix1 or iy2 <= iy1:
            return False
        return (ix2 - ix1) * (iy2 - iy1) > min(_area(a), _area(b)) * 0.5

    def _near(a, b):
        # Centres less than 20 px apart (Manhattan) count as the same character.
        return abs(a['center_x'] - b['center_x']) + abs(a['center_y'] - b['center_y']) < 20

    ordered = sorted(char_boxes, key=lambda b: (b['y1'], b['center_x']))

    # Greedy pass: keep a box only if it clashes with nothing kept so far.
    kept = []
    for candidate in ordered:
        if not any(_clashes(candidate, prior) for prior in kept):
            kept.append(candidate)

    # Too few survivors: cluster by proximity instead and keep the largest
    # box of each cluster.
    if len(kept) < expected_count:
        clusters = []
        for candidate in ordered:
            home = next(
                (cluster for cluster in clusters
                 if any(_near(candidate, member) for member in cluster)),
                None,
            )
            if home is None:
                clusters.append([candidate])
            else:
                home.append(candidate)
        kept = [max(cluster, key=_area) for cluster in clusters]

    return kept[:expected_count]


def estimate_char_boxes_from_text_bbox(text_bbox, text_content, expected_count, is_vertical):
    """
    Estimate character boxes by dividing the text box into equal cells.

    Used as a fallback when image-based detection does not yield the expected
    number of characters.

    Args:
        text_bbox: Text bounding box ``{'x1', 'y1', 'x2', 'y2'}``.
        text_content: Recognised text (not used; the count comes from
            ``expected_count``).
        expected_count: Number of boxes to produce.
        is_vertical: ``True`` to lay cells out on an estimated column/row
            grid, ``False`` for a single row of equal-width cells.

    Returns:
        List of ``expected_count`` box dicts with ``'x1'``, ``'y1'``,
        ``'x2'``, ``'y2'``, ``'center_x'`` and ``'center_y'`` (all floats).
        Returns ``[]`` when ``expected_count`` is not positive.
    """
    # No characters: nothing to lay out (and avoids dividing by zero below).
    if expected_count <= 0:
        return []

    left = text_bbox['x1']
    top = text_bbox['y1']
    roi_width = text_bbox['x2'] - left
    roi_height = text_bbox['y2'] - top

    if is_vertical:
        # A glyph is assumed to be roughly 0.8x as wide as the per-character
        # share of the height; that gives the number of columns.
        approx_char_width = roi_height / expected_count * 0.8
        if approx_char_width:
            estimated_cols = max(1, int(roi_width / approx_char_width))
        else:
            # Degenerate (zero-height) box: fall back to a single column.
            estimated_cols = 1
        estimated_rows = (expected_count + estimated_cols - 1) // estimated_cols
        char_width = roi_width / estimated_cols
        char_height = roi_height / estimated_rows
    else:
        # Horizontal: one row, one column per character.
        estimated_cols = expected_count
        char_width = roi_width / expected_count
        char_height = roi_height

    estimated_boxes = []
    for i in range(expected_count):
        col = i % estimated_cols
        row = i // estimated_cols

        x = left + col * char_width + char_width / 2
        y = top + row * char_height + char_height / 2

        estimated_boxes.append({
            'x1': float(x - char_width / 2),
            'y1': float(y - char_height / 2),
            'x2': float(x + char_width / 2),
            'y2': float(y + char_height / 2),
            'center_x': float(x),
            'center_y': float(y)
        })

    return estimated_boxes

# Disable oneDNN to avoid a NotImplementedError (known issue in PaddlePaddle 3.3.0).
# These flags are set before PaddleOCR is imported further down.
os.environ['FLAGS_onednn'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0'

# Windows encoding fix: re-wrap stdout/stderr as UTF-8 and replace
# characters that cannot be encoded instead of raising.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# Put the bundled comic-text-detector checkout at the front of sys.path.
# project_root is the directory two levels above this file.
project_root = Path(__file__).parent.parent
comic_detector_path = project_root / 'comic-text-detector-master' / 'comic-text-detector-master'
sys.path.insert(0, str(comic_detector_path))

# Add the local OnnxOCR checkout as well (fallback option), if it exists.
onnxocr_path = project_root / 'OnnxOCR-main' / 'OnnxOCR-main'
if onnxocr_path.exists():
    sys.path.insert(0, str(onnxocr_path))

# wandb is an optional dependency: comic-text-detector imports it, but it is
# not needed for inference.
try:
    import wandb
except ImportError:
    # Register a stand-in object under the name 'wandb' so that later
    # `import wandb` statements succeed; its methods do nothing.
    class FakeWandb:
        @staticmethod
        def init(*args, **kwargs):
            return None
        @staticmethod
        def log(*args, **kwargs):
            pass
        @staticmethod
        def log_model(*args, **kwargs):
            pass
    sys.modules['wandb'] = FakeWandb()

# Import the detector from the comic-text-detector path added above.
# Failure is fatal: print the error and a traceback, then exit with status 1.
try:
    from inference import TextDetector, REFINEMASK_ANNOTATION
    from utils.io_utils import imread, imwrite
except ImportError as e:
    print(f"[ERROR] 无法导入comic-text-detector模块: {e}")
    print(f"[INFO] 请确保已安装依赖: pip install torch torchvision opencv-python numpy tqdm")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# PaddleOCR (the only OCR engine used).
# An import failure is only recorded in PADDLEOCR_AVAILABLE here; the module
# keeps loading.
try:
    # Prefer the local PaddleOCR checkout when it exists.
    paddleocr_path = project_root / 'PaddleOCR-main' / 'PaddleOCR-main'
    if paddleocr_path.exists():
        sys.path.insert(0, str(paddleocr_path))
    from paddleocr import PaddleOCR
    PADDLEOCR_AVAILABLE = True
    print("[INFO] PaddleOCR 可用")
except ImportError as e:
    print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
    print("[ERROR] PaddleOCR 是必需的，请确保已正确安装")
    PADDLEOCR_AVAILABLE = False


# The panel-detection code has moved to python/generate-anim/detect_panels.py
# and is used via import; the definitions in the except branch are a
# backward-compatible fallback.
try:
    # Add this file's directory to sys.path so the sibling module can be imported.
    import sys
    current_dir = Path(__file__).parent
    if str(current_dir) not in sys.path:
        sys.path.insert(0, str(current_dir))
    from detect_panels import detect_comic_panels, merge_panel_mask_with_text_mask
except ImportError as e:
    print(f"[WARN] 无法导入detect_panels模块，使用本地实现: {e}")
    # Import failed: fall back to the local implementations below.
    def detect_comic_panels(img):
        """
        Detect comic panels (frame borders) with OpenCV - local fallback.

        Args:
            img: Input image; a 3-dimensional array is converted from BGR to
                grayscale, anything else is used as-is.

        Returns:
            ``(panel_mask, panels)``: a mask the size of the grayscale image
            with detected lines and panel rectangles drawn in 255, and a list
            of dicts with ``'x'``, ``'y'``, ``'width'``, ``'height'``,
            ``'center_x'`` and ``'center_y'``.
        """
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img.copy()
        
        panel_mask = np.zeros_like(gray)
        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
        
        # Morphological opening with a 40x1 kernel keeps long horizontal edge runs.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        horizontal_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, horizontal_kernel)
        
        # Same with a 1x40 kernel for long vertical edge runs.
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
        vertical_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, vertical_kernel)
        
        lines_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)
        lines = cv2.HoughLinesP(lines_mask, 1, np.pi/180, threshold=100, 
                                minLineLength=50, maxLineGap=10)
        
        # Draw every detected line segment onto the panel mask.
        if lines is not None:
            for line in lines:
                x1, y1, x2, y2 = line[0]
                cv2.line(panel_mask, (x1, y1), (x2, y2), 255, 2)
        
        # Thicken the line mask, then take the external contours of the result.
        kernel = np.ones((3, 3), np.uint8)
        dilated = cv2.dilate(lines_mask, kernel, iterations=2)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        panels = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            area = w * h
            # Keep only bounding rectangles larger than 1% of the image area.
            if area > img.shape[0] * img.shape[1] * 0.01:
                cv2.rectangle(panel_mask, (x, y), (x + w, y + h), 255, 2)
                panels.append({
                    'x': x,
                    'y': y,
                    'width': w,
                    'height': h,
                    'center_x': x + w / 2,
                    'center_y': y + h / 2
                })
        
        return panel_mask, panels
    
    def merge_panel_mask_with_text_mask(panel_mask, text_mask):
        """
        Merge the panel mask with the text mask.

        The panel mask is resized to the text mask's size when their shapes
        differ; the result is the element-wise maximum of the two.
        """
        if panel_mask.shape != text_mask.shape:
            panel_mask = cv2.resize(panel_mask, (text_mask.shape[1], text_mask.shape[0]))
        return np.maximum(panel_mask, text_mask)


def get_text_block_panel(text_block, panels):
    """
    Find the panel that contains the centre of a text block.

    Args:
        text_block: Dict with a ``'bbox'`` entry holding ``'x1'``, ``'y1'``,
            ``'x2'`` and ``'y2'``.
        panels: List of panel dicts with ``'x'``, ``'y'``, ``'width'`` and
            ``'height'``.

    Returns:
        Index of the first panel whose rectangle (edges inclusive) contains
        the block's centre, or ``-1`` when no panel does.
    """
    box = text_block['bbox']
    mid_x = (box['x1'] + box['x2']) / 2
    mid_y = (box['y1'] + box['y2']) / 2

    for idx, panel in enumerate(panels):
        # Horizontal extent first; skip early when the centre is outside it.
        if not (panel['x'] <= mid_x <= panel['x'] + panel['width']):
            continue
        if panel['y'] <= mid_y <= panel['y'] + panel['height']:
            return idx

    return -1


def sort_text_blocks_by_panels(text_blocks, panels, image_width, image_height):
    """
    Sort text blocks in Japanese manga reading order (right-to-left, top-to-bottom).

    Each block is annotated in place with ``'center_x'``, ``'center_y'`` and
    ``'panel_index'`` (from ``get_text_block_panel``). The sort itself uses
    only the block centres:

    1. Vertical band, counted from the right edge of the image; a band is
       20% of the image width, but at least 100 pixels.
    2. Within a band, smaller centre y first (top to bottom).
    3. On equal y, larger centre x first (right to left).

    Args:
        text_blocks: List of block dicts, each with a ``'bbox'`` entry.
        panels: List of panel dicts (used only for the ``'panel_index'``
            annotation).
        image_width: Image width.
        image_height: Image height (not used by the ordering).

    Returns:
        A new list with the blocks in reading order; ``[]`` for empty input.
    """
    if not text_blocks:
        return []

    # Annotate every block with its centre point and owning panel.
    for entry in text_blocks:
        box = entry['bbox']
        entry['center_x'] = (box['x1'] + box['x2']) / 2
        entry['center_y'] = (box['y1'] + box['y2']) / 2
        entry['panel_index'] = get_text_block_panel(entry, panels)

    band_width = max(image_width * 0.2, 100)

    def reading_order(entry):
        cx = entry['center_x']
        cy = entry['center_y']
        # Band 0 is the right-most strip of the page.
        band = int((image_width - cx) / band_width)
        return (band, cy, -cx)

    return sorted(text_blocks, key=reading_order)


def _estimate_char_boxes(text_lines):
    """
    Estimate one bounding box per non-space character of each OCR text line.

    Used when per-character detection did not yield exactly one box per
    character. Each line's own box is sliced evenly along its reading axis:
    a box more than 1.2x taller than wide is treated as vertical text
    (characters run top to bottom), anything else as horizontal text
    (characters run left to right). Across the reading axis every character
    gets a fixed +/- 5 pixel band around the line centre.

    Args:
        text_lines: list of dicts with 'text' and 'bbox'; 'bbox' holds
            x1/y1/x2/y2/center_x/center_y in original-image coordinates

    Returns:
        List of dicts with 'char', 'x1', 'y1', 'x2', 'y2', 'center_x',
        'center_y', ordered line by line and character by character.
    """
    estimated = []
    for line in text_lines:
        chars = line['text'].replace(' ', '')
        if not chars:
            continue

        line_bbox = line['bbox']
        bbox_width = line_bbox['x2'] - line_bbox['x1']
        bbox_height = line_bbox['y2'] - line_bbox['y1']

        if bbox_height > bbox_width * 1.2:
            # Vertical line: step down the box, one slot per character.
            char_height = bbox_height / len(chars)
            char_x = line_bbox['center_x']
            for k, char in enumerate(chars):
                char_y = line_bbox['y1'] + char_height * (k + 0.5)
                estimated.append({
                    'char': char,
                    'x1': char_x - 5,
                    'y1': char_y - char_height / 2,
                    'x2': char_x + 5,
                    'y2': char_y + char_height / 2,
                    'center_x': char_x,
                    'center_y': char_y
                })
        else:
            # Horizontal line: step across the box, one slot per character.
            char_width = bbox_width / len(chars)
            char_y = line_bbox['center_y']
            for k, char in enumerate(chars):
                char_x = line_bbox['x1'] + char_width * (k + 0.5)
                estimated.append({
                    'char': char,
                    'x1': char_x - char_width / 2,
                    'y1': char_y - 5,
                    'x2': char_x + char_width / 2,
                    'y2': char_y + 5,
                    'center_x': char_x,
                    'center_y': char_y
                })
    return estimated


def detect_and_ocr_comic(image_path, model_path=None, output_dir=None):
    """
    Detect the text regions of one comic page, OCR them and save the dialogues.

    Steps:
        1. comic-text-detector finds text blocks and a refined text mask.
        2. detect_comic_panels finds panels (retried without the text hints
           when fewer than 4 panels are found).
        3-4. Panel mask and text mask are merged; all masks and the panel
           list are written to <output_dir>/tmp as intermediate files.
        5. Every text block is cropped from the page, preprocessed and passed
           to PaddleOCR; per-character positions are taken from
           detect_characters_with_opencv, or estimated per OCR line when that
           does not return exactly one box per character.
        6. Dialogues are ordered by sort_text_blocks_by_panels.
        7. The result is written to <output_dir>/<image stem>_dialogues.json.

    Args:
        image_path: path of the page image
        model_path: comic-text-detector model file; when None the default
            locations under comic_detector_path / 'data' are tried
        output_dir: directory for the result; defaults to the image's directory

    Returns:
        dict with 'image_file', 'reading_order', 'dialogues' (each with
        'order', 'text', 'bbox', 'character_positions') and 'total_count'.

    Raises:
        FileNotFoundError: the image, the model file or the saved text mask
            is missing
        RuntimeError: PaddleOCR is unavailable or fails to initialise
        ValueError: the image cannot be decoded
    """
    image_path = Path(image_path)

    if not image_path.exists():
        raise FileNotFoundError(f"图片文件不存在: {image_path}")

    print(f"📖 正在处理图片: {image_path.name}")

    # Resolve the detector model: first existing default location wins.
    if model_path is None:
        possible_paths = [
            comic_detector_path / 'data' / 'comictextdetector.pt',
            comic_detector_path / 'data' / 'comictextdetector.pt.onnx',
        ]
        model_path = next((p for p in possible_paths if p.exists()), None)

        if model_path is None:
            raise FileNotFoundError(
                f"未找到comic-text-detector模型文件。请下载模型并放到以下位置之一:\n" +
                "\n".join([f"  - {p}" for p in possible_paths])
            )

    # Resolve the output directory.
    if output_dir is None:
        output_dir = image_path.parent
    else:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Intermediate artefacts (masks, panel list) go to a tmp sub-directory.
    tmp_dir = output_dir / 'tmp'
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Initialise comic-text-detector on the GPU when one is available.
    import torch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"[INFO] 使用设备: {device}")

    try:
        detector = TextDetector(
            model_path=str(model_path),
            input_size=1024,
            device=device,
            act='leaky'
        )
    except Exception as e:
        print(f"[ERROR] 初始化检测器失败: {e}")
        raise

    # Initialise PaddleOCR (the only OCR engine in use).
    print("[INFO] 初始化PaddleOCR...")
    ocr_engine = None
    paddleocr_instance = None

    if not PADDLEOCR_AVAILABLE:
        raise RuntimeError("PaddleOCR 不可用，请确保已正确安装 paddlex[ocr-core]")

    try:
        paddleocr_instance = PaddleOCR(
            use_angle_cls=True,  # angle classifier, helps with vertical text
            lang='ch',  # Chinese model
            enable_mkldnn=False  # MKL-DNN disabled to avoid oneDNN NotImplementedError
        )
        ocr_engine = 'paddleocr'
        print("[INFO] PaddleOCR 初始化成功")
    except Exception as e:
        print(f"[ERROR] PaddleOCR初始化失败: {e}")
        raise RuntimeError(f"PaddleOCR 初始化失败: {e}") from e

    # Load the page.
    img = imread(str(image_path))
    if img is None:
        raise ValueError(f"无法读取图片文件: {image_path}")

    im_h, im_w = img.shape[:2]
    print(f"[INFO] 图片尺寸: {im_w}x{im_h}")

    image_name = image_path.stem

    # Step 1: detect text regions first; they also assist panel detection.
    print("[INFO] 步骤1: 检测文字区域...")
    try:
        mask, mask_refined, blk_list = detector(
            img,
            refine_mode=REFINEMASK_ANNOTATION,
            keep_undetected_mask=True
        )
    except Exception as e:
        print(f"[ERROR] 检测失败: {e}")
        raise

    print(f"[OK] 检测到 {len(blk_list)} 个文字区域")

    # Step 2: detect panels, assisted by the text mask and the text blocks.
    print("[INFO] 步骤2: 检测漫画格子（使用文字遮罩图和文字块信息辅助）...")

    # Convert detector blocks to the plain format detect_comic_panels takes.
    text_blocks = []
    for blk in blk_list:
        x1, y1, x2, y2 = blk.xyxy
        text_blocks.append({
            'xyxy': [int(x1), int(y1), int(x2), int(y2)]
        })

    panel_mask, panels = detect_comic_panels(img, text_blocks=text_blocks, text_mask=mask_refined)
    print(f"[OK] 检测到 {len(panels)} 个格子")

    # Too few panels: retry without the text hints and keep the richer result.
    if len(panels) < 4:
        print(f"[WARN] 检测到的格子数量较少（{len(panels)}个），尝试使用传统方法重新检测...")
        panel_mask_fallback, panels_fallback = detect_comic_panels(img, text_blocks=None, text_mask=None)
        if len(panels_fallback) > len(panels):
            panel_mask = panel_mask_fallback
            panels = panels_fallback
            print(f"[OK] 使用传统方法检测到 {len(panels)} 个格子")

    # Save the panel mask (intermediate file).
    panel_mask_path = tmp_dir / f"{image_name}_panel_mask.png"
    imwrite(str(panel_mask_path), panel_mask)
    print(f"[OK] 已保存格子遮罩图: {panel_mask_path}")

    # Save the panel list as JSON (intermediate file).
    panels_json = {
        'image_file': image_path.name,
        'panels': panels,
        'total_count': len(panels)
    }
    panels_json_path = tmp_dir / f"{image_name}_panels.json"
    with open(panels_json_path, 'w', encoding='utf-8') as f:
        json.dump(panels_json, f, ensure_ascii=False, indent=2)
    print(f"[OK] 已保存格子信息: {panels_json_path}")

    # Save the raw text mask (intermediate file).
    text_mask_path = tmp_dir / f"{image_name}_text_mask.png"
    imwrite(str(text_mask_path), mask_refined)
    print(f"[OK] 已保存文字遮罩图: {text_mask_path}")

    # Step 3: merge the panel mask with the text mask.
    print("[INFO] 步骤3: 合并格子遮罩图和文字mask图...")
    combined_mask = merge_panel_mask_with_text_mask(panel_mask, mask_refined)

    # Step 4: save the merged mask (intermediate file).
    combined_mask_path = tmp_dir / f"{image_name}_combined_mask.png"
    print(f"[INFO] 步骤4: 保存合并后的mask图片到磁盘...")
    imwrite(str(combined_mask_path), combined_mask)
    print(f"[OK] 已保存合并后的mask图片: {combined_mask_path}")

    # Make sure the text mask really reached the disk.
    if not text_mask_path.exists():
        raise FileNotFoundError(f"文字遮罩图文件未成功生成: {text_mask_path}")
    print(f"[OK] 已确认文字遮罩图文件存在")

    # Step 5: crop every text region and recognise it.
    print(f"[INFO] 步骤5: 从mask文件中读取并识别 {len(blk_list)} 个文字区域...")

    dialogues = []

    # The CLAHE operator does not depend on the crop, so build it once.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

    for i, blk in enumerate(blk_list):
        x1, y1, x2, y2 = blk.xyxy

        # Clamp the block to the page.
        x1 = max(0, int(x1))
        y1 = max(0, int(y1))
        x2 = min(im_w, int(x2))
        y2 = min(im_h, int(y2))

        # The merged mask (still in memory) is only used to reject empty crops.
        crop_mask = combined_mask[y1:y2, x1:x2]

        if crop_mask.size == 0:
            continue

        # OCR runs on the crop of the original page, not on the mask.
        crop_img = img[y1:y2, x1:x2]

        # Normalise the crop to three channels.
        if len(crop_img.shape) == 2:
            crop_img = cv2.cvtColor(crop_img, cv2.COLOR_GRAY2RGB)
        elif len(crop_img.shape) == 3 and crop_img.shape[2] == 4:
            crop_img = cv2.cvtColor(crop_img, cv2.COLOR_RGBA2RGB)

        # Conservative preprocessing to raise the OCR hit rate.
        # 1. Grayscale.
        if len(crop_img.shape) == 3:
            gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
        else:
            gray = crop_img

        # 2. A mean brightness below 127 is taken as white-on-black text;
        #    invert it so the OCR sees dark text on a light background.
        mean_brightness = np.mean(gray)
        is_dark_background = mean_brightness < 127

        if is_dark_background:
            gray = cv2.bitwise_not(gray)

        # 3. Upscale only very small crops.
        crop_h, crop_w = gray.shape[:2]
        if crop_h < 32 or crop_w < 32:
            scale = 2.0
            new_h, new_w = int(crop_h * scale), int(crop_w * scale)
            gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

        # 4. Contrast enhancement (CLAHE, conservative settings).
        enhanced = clahe.apply(gray)

        # 5. Light denoising (avoid blurring the strokes).
        enhanced = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)

        # 6. Back to three channels for the OCR engine.
        if len(enhanced.shape) == 2:
            crop_img_processed = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)
        else:
            crop_img_processed = enhanced

        text_block = {
            'index': i + 1,
            'bbox': {
                'x1': x1,
                'y1': y1,
                'x2': x2,
                'y2': y2,
                'width': x2 - x1,
                'height': y2 - y1,
                'center_x': (x1 + x2) / 2,
                'center_y': (y1 + y2) / 2
            }
        }

        try:
            if ocr_engine == 'paddleocr':
                try:
                    ocr_result = paddleocr_instance.ocr(crop_img_processed)

                    # Debug: dump the shape of the OCR result.
                    if ocr_result:
                        print(f"  [DEBUG] 第 {i+1} 个区域: ocr_result类型={type(ocr_result)}, 长度={len(ocr_result) if isinstance(ocr_result, (list, tuple)) else 'N/A'}")
                        if len(ocr_result) > 0:
                            result_item = ocr_result[0]
                            print(f"  [DEBUG] 第 {i+1} 个区域: ocr_result[0]类型={type(result_item)}")
                            if hasattr(result_item, '__dict__'):
                                print(f"  [DEBUG] 第 {i+1} 个区域: OCRResult属性={list(result_item.__dict__.keys())}")
                            try:
                                if hasattr(result_item, 'text_lines') or hasattr(result_item, 'texts'):
                                    print(f"  [DEBUG] 第 {i+1} 个区域: 尝试访问text_lines或texts属性")
                            except Exception:
                                # Purely diagnostic probe; a property that raises
                                # must not abort recognition of this region.
                                pass

                    if ocr_result and len(ocr_result) > 0:
                        # PaddleOCR 3.x returns OCRResult objects whose .json is
                        # {'res': {'rec_texts': [...], 'rec_scores': [...],
                        #          'rec_polys': [...], 'rec_boxes': [...]}}
                        result_item = ocr_result[0]

                        try:
                            result_json = result_item.json
                            res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {}

                            rec_texts = res_data.get('rec_texts', [])
                            rec_scores = res_data.get('rec_scores', [])
                            rec_polys = res_data.get('rec_polys', [])  # [[[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ...]
                            rec_boxes = res_data.get('rec_boxes', [])  # [[x1,y1,x2,y2], ...]

                            if not rec_texts:
                                print(f"  [DEBUG] 第 {i+1} 个区域: PaddleOCR未识别到文字")
                                continue

                            # Collect every usable OCR line with its box in
                            # original-image coordinates.
                            all_text_lines = []
                            for idx, text in enumerate(rec_texts):
                                if not text or not text.strip():
                                    continue

                                confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9

                                # Prefer the polygon; fall back to the axis-aligned box.
                                if idx < len(rec_polys) and rec_polys[idx]:
                                    bbox_coords = rec_polys[idx]
                                elif idx < len(rec_boxes) and rec_boxes[idx]:
                                    box = rec_boxes[idx]  # [x1, y1, x2, y2]
                                    bbox_coords = [
                                        [box[0], box[1]],  # top-left
                                        [box[2], box[1]],  # top-right
                                        [box[2], box[3]],  # bottom-right
                                        [box[0], box[3]]   # bottom-left
                                    ]
                                else:
                                    print(f"  [DEBUG] 第 {i+1} 个区域: 第 {idx} 个文本没有坐标信息")
                                    continue

                                if not isinstance(bbox_coords, (list, tuple)) or len(bbox_coords) < 4:
                                    print(f"  [DEBUG] 第 {i+1} 个区域: bbox_coords 格式不正确: {type(bbox_coords)}, {bbox_coords}")
                                    continue

                                # Validate each corner while splitting into x / y lists.
                                try:
                                    x_coords = []
                                    y_coords = []
                                    for coord in bbox_coords:
                                        if isinstance(coord, (list, tuple)) and len(coord) >= 2:
                                            x_coords.append(coord[0])
                                            y_coords.append(coord[1])
                                        else:
                                            print(f"  [DEBUG] 第 {i+1} 个区域: 坐标点格式不正确: {coord}")
                                            break

                                    if not x_coords or not y_coords or len(x_coords) < 4:
                                        print(f"  [DEBUG] 第 {i+1} 个区域: 无法提取足够的坐标点")
                                        continue
                                except (TypeError, IndexError) as e:
                                    print(f"  [DEBUG] 第 {i+1} 个区域: 解析坐标失败: {e}, bbox_coords={bbox_coords}")
                                    continue

                                left = min(x_coords)
                                top = min(y_coords)
                                right = max(x_coords)
                                bottom = max(y_coords)

                                # Crop-relative -> original-image coordinates.
                                all_text_lines.append({
                                    'text': text,
                                    'bbox': {
                                        'x1': float(x1 + left),
                                        'y1': float(y1 + top),
                                        'x2': float(x1 + right),
                                        'y2': float(y1 + bottom),
                                        'center_x': float(x1 + (left + right) / 2),
                                        'center_y': float(y1 + (top + bottom) / 2)
                                    },
                                    'confidence': confidence
                                })

                            if not all_text_lines:
                                print(f"  [DEBUG] 第 {i+1} 个区域未识别到文字 (all_texts为空)")
                                continue

                            # Reading order of the lines inside the block: top to
                            # bottom, then right to left (stable sort).
                            sorted_text_lines = sorted(
                                all_text_lines,
                                key=lambda line: (line['bbox']['y1'], -line['bbox']['center_x'])
                            )

                            # Character detection runs once over the union of all
                            # line boxes so no character is missed between lines.
                            min_x1 = min(line['bbox']['x1'] for line in all_text_lines)
                            min_y1 = min(line['bbox']['y1'] for line in all_text_lines)
                            max_x2 = max(line['bbox']['x2'] for line in all_text_lines)
                            max_y2 = max(line['bbox']['y2'] for line in all_text_lines)

                            combined_bbox = {
                                'x1': float(min_x1),
                                'y1': float(min_y1),
                                'x2': float(max_x2),
                                'y2': float(max_y2),
                                'center_x': float((min_x1 + max_x2) / 2),
                                'center_y': float((min_y1 + max_y2) / 2)
                            }

                            combined_text_for_detection = ''.join([line['text'] for line in all_text_lines])

                            # The box is passed in original-image coordinates
                            # together with the full page image.
                            text_bbox_for_detection = {
                                'x1': combined_bbox['x1'],
                                'y1': combined_bbox['y1'],
                                'x2': combined_bbox['x2'],
                                'y2': combined_bbox['y2']
                            }

                            detected_char_boxes = detect_characters_with_opencv(
                                img, text_bbox_for_detection, combined_text_for_detection, ocr_bbox_hint=combined_bbox
                            )

                            # Targeted debug trace for specific sample texts.
                            if '远道' in combined_text_for_detection or '石田' in combined_text_for_detection:
                                print(f"  [DEBUG] 合并后OCR文本: {combined_text_for_detection}")
                                text_no_space_debug = combined_text_for_detection.replace(' ', '')
                                print(f"  [DEBUG] 去除空格后: {text_no_space_debug}, 字符数: {len(text_no_space_debug)}")
                                print(f"  [DEBUG] OpenCV检测到的字符框数: {len(detected_char_boxes)}")
                                if len(detected_char_boxes) > 0:
                                    print(f"  [DEBUG] 前3个字符框位置: center_x={[b['center_x'] for b in detected_char_boxes[:3]]}, center_y={[b['center_y'] for b in detected_char_boxes[:3]]}")

                            text_no_space = combined_text_for_detection.replace(' ', '')

                            all_char_boxes_list = []
                            if len(detected_char_boxes) > 0 and len(detected_char_boxes) == len(text_no_space):
                                # Exactly one detected box per character: order the
                                # boxes top to bottom, then right to left, and pair
                                # them with the reversed text.
                                sorted_char_boxes = sorted(detected_char_boxes, key=lambda b: (b['y1'], -b['center_x']))

                                for char_box, char in zip(sorted_char_boxes, text_no_space[::-1]):
                                    all_char_boxes_list.append({
                                        'char': char,
                                        'x1': char_box['x1'],
                                        'y1': char_box['y1'],
                                        'x2': char_box['x2'],
                                        'y2': char_box['y2'],
                                        'center_x': char_box['center_x'],
                                        'center_y': char_box['center_y']
                                    })
                            else:
                                # Detection failed or miscounted. This branch must
                                # pair with the `if` (not with the loop above),
                                # otherwise estimates are appended on top of the
                                # detected boxes. Estimate per OCR line, in the same
                                # line order as the emitted text.
                                all_char_boxes_list = _estimate_char_boxes(sorted_text_lines)

                            # Assemble the block text in reading order.
                            combined_text = ' '.join([line['text'] for line in sorted_text_lines])
                            avg_confidence = sum([line['confidence'] for line in all_text_lines]) / len(all_text_lines)

                            # Convert character boxes to the math coordinate system
                            # (origin bottom-left, y pointing up). The list order is
                            # kept so each entry stays paired with its character.
                            character_positions = []
                            for char_box in all_char_boxes_list:
                                center_x_old = char_box['center_x']
                                center_y_old = char_box['center_y']
                                center_x_new, center_y_new = convert_coordinate_to_math_system(center_x_old, center_y_old, im_h)

                                x1_new, y1_new = convert_coordinate_to_math_system(char_box['x1'], char_box['y1'], im_h)
                                x2_new, y2_new = convert_coordinate_to_math_system(char_box['x2'], char_box['y2'], im_h)

                                character_positions.append({
                                    'x': center_x_new,
                                    'y': center_y_new,
                                    'center_x': center_x_new,
                                    'center_y': center_y_new,
                                    'x1': min(x1_new, x2_new),  # left edge
                                    'y1': max(y1_new, y2_new),  # top edge (larger y after the flip)
                                    'x2': max(x1_new, x2_new),  # right edge
                                    'y2': min(y1_new, y2_new),  # bottom edge (smaller y after the flip)
                                    'x_old': center_x_old,  # image-space centre, kept for debugging
                                    'y_old': center_y_old
                                })

                            # Safety net: drop the positions when they cannot be
                            # matched one-to-one with the non-space characters.
                            text_no_space_for_check = combined_text.replace(' ', '')
                            if len(character_positions) != len(text_no_space_for_check):
                                if '远道' in combined_text or '石田' in combined_text:
                                    print(f"  [DEBUG] 字符位置数量不匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}, text=\"{combined_text}\"")
                                character_positions = []
                            elif '远道' in combined_text or '石田' in combined_text:
                                print(f"  [DEBUG] 字符位置数量匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}")
                                print(f"  [DEBUG] 前3个character_positions: {[{'x': p.get('center_x', p.get('x', 0)), 'y': p.get('center_y', p.get('y', 0))} for p in character_positions[:3]]}")

                            if ('远道' in combined_text or '石田' in combined_text) and character_positions:
                                print(f"  [DEBUG] 保存到dialogues: text=\"{combined_text}\", character_positions数量={len(character_positions)}")

                            if combined_text and combined_text.strip():
                                dialogues.append({
                                    'order': i + 1,
                                    'text': combined_text,
                                    'bbox': text_block['bbox'],
                                    'confidence': avg_confidence,
                                    'character_positions': character_positions if character_positions else None
                                })
                                text_preview = combined_text[:30] + '...' if len(combined_text) > 30 else combined_text
                                print(f"  [{i+1}/{len(blk_list)}] 识别: {text_preview} (置信度: {avg_confidence:.2f})")
                            else:
                                print(f"  [DEBUG] 第 {i+1} 个区域: combined_text为空 (all_texts长度: {len(all_text_lines)})")
                        except Exception as e:
                            # One unparsable region must not abort the whole page.
                            print(f"  [WARN] PaddleOCR解析第 {i+1} 个区域结果失败: {e}")
                            import traceback
                            traceback.print_exc()
                            continue
                except Exception as e:
                    print(f"  [WARN] PaddleOCR识别第 {i+1} 个区域失败: {e}")
                    import traceback
                    traceback.print_exc()
                    continue
            else:
                # PaddleOCR is the only supported engine.
                raise RuntimeError(f"OCR引擎不是PaddleOCR，当前引擎: {ocr_engine}")
        except Exception as e:
            print(f"  [WARN] 识别第 {i+1} 个区域失败: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"[OK] 成功识别 {len(dialogues)} 段文字")

    # Step 6: put the dialogues into reading order.
    print("[INFO] 步骤6: 按格子位置排序...")
    sorted_dialogues = sort_text_blocks_by_panels(dialogues, panels, im_w, im_h)

    # Renumber and keep only order, text, bbox and character_positions.
    formatted_dialogues = []
    for order, dialogue in enumerate(sorted_dialogues, 1):
        formatted_dialogues.append({
            'order': order,
            'text': dialogue['text'],
            'bbox': dialogue.get('bbox', {}),
            'character_positions': dialogue.get('character_positions')
        })

    # Step 7: write the final result to output_dir.
    print("[INFO] 步骤7: 保存JSON结果...")
    result = {
        'image_file': image_path.name,
        'reading_order': '从右到左、从上到下（日式漫画阅读顺序）',
        'dialogues': formatted_dialogues,
        'total_count': len(formatted_dialogues)
    }

    json_path = output_dir / f"{image_name}_dialogues.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"[OK] 已保存对白结果: {json_path}")

    return result


def batch_detect_and_ocr(image_dir, model_path=None, output_dir=None):
    """
    Run detect_and_ocr_comic on every image directly inside a directory.

    Images are the regular files whose suffix, compared case-insensitively,
    is .jpg/.jpeg/.png/.bmp/.webp. Each file is listed exactly once, also on
    case-insensitive filesystems. Pages are processed in the order of the
    number before the first '_' in the file stem (0 when there is none), with
    the file name as tie-breaker so the order is deterministic.

    A page that fails is reported and skipped; the batch continues.

    Args:
        image_dir: directory holding the page images
        model_path: forwarded to detect_and_ocr_comic
        output_dir: result directory; defaults to <image_dir>/ocr

    Returns:
        List with the result dict of every page that was processed
        successfully, in processing order.

    Raises:
        FileNotFoundError: image_dir does not exist
    """
    image_dir = Path(image_dir)

    if not image_dir.exists():
        raise FileNotFoundError(f"图片目录不存在: {image_dir}")

    # Filter on the case-folded suffix instead of globbing "*.jpg" and
    # "*.JPG" separately: the two globs return the same file twice on
    # case-insensitive filesystems and miss mixed-case names like ".Jpg".
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.webp'}
    image_files = [
        entry for entry in image_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    ]

    def page_order(path):
        # isdecimal() only accepts characters that int() can parse.
        prefix = path.stem.split('_')[0]
        return (int(prefix) if prefix.isdecimal() else 0, path.name)

    image_files.sort(key=page_order)

    print(f"[INFO] 找到 {len(image_files)} 张图片")

    # Resolve the output directory.
    if output_dir is None:
        output_dir = image_dir / 'ocr'
    else:
        output_dir = Path(output_dir)

    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    for i, image_file in enumerate(image_files, 1):
        print(f"\n[{i}/{len(image_files)}] 处理: {image_file.name}")
        try:
            result = detect_and_ocr_comic(image_file, model_path, output_dir)
            results.append(result)
        except Exception as e:
            # Best effort: one broken page must not stop the batch.
            print(f"[ERROR] 处理 {image_file.name} 失败: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n[OK] 批量处理完成，成功处理 {len(results)} 张图片")
    return results


if __name__ == '__main__':
    # Command-line entry: a file is processed as a single page, a directory
    # as a batch; any other input path is reported and exits with status 1.
    import argparse

    parser = argparse.ArgumentParser(description='检测漫画文字区域并用OCR识别')
    for flags, options in (
        (('input',), {'help': '输入图片路径或目录'}),
        (('-o', '--output'), {'help': '输出目录'}),
        (('-m', '--model'), {'help': 'comic-text-detector模型路径'}),
    ):
        parser.add_argument(*flags, **options)

    args = parser.parse_args()
    input_path = Path(args.input)

    # Choose the handler first, then call it with the shared arguments.
    if input_path.is_file():
        handler = detect_and_ocr_comic
    elif input_path.is_dir():
        handler = batch_detect_and_ocr
    else:
        print(f"[ERROR] 输入路径不存在: {input_path}")
        sys.exit(1)

    handler(input_path, args.model, args.output)