| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415 |
- # -*- coding: utf-8 -*-
- """
- 使用 comic-text-detector 检测文字区域,然后用 PaddleOCR 识别文字内容
- 按日式漫画阅读顺序(从右到左、从上到下)排序
- """
- import sys
- import os
- import json
- from pathlib import Path
- import cv2
- import numpy as np
def convert_coordinate_to_math_system(x, y, image_height):
    """Convert a point from OpenCV image coordinates to math-style coordinates.

    OpenCV places the origin at the top-left corner with the y axis growing
    downward; the target system places the origin at the bottom-left corner
    with the y axis growing upward. Only the y coordinate is flipped.

    Args:
        x: x coordinate in the OpenCV system (returned unchanged).
        y: y coordinate in the OpenCV system.
        image_height: Height of the image.

    Returns:
        The tuple ``(x, image_height - y)``.
    """
    return x, image_height - y
def _projection_runs(projection, threshold):
    """Return ``(start, end)`` pairs for each run of entries above *threshold*.

    ``end`` is exclusive: it is the index of the first entry that is not above
    the threshold after the run, or ``len(projection)`` if the run reaches the
    end of the projection.
    """
    runs = []
    run_start = None
    for idx, value in enumerate(projection):
        if value > threshold:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            runs.append((run_start, idx))
            run_start = None
    if run_start is not None:
        runs.append((run_start, len(projection)))
    return runs


def _make_char_box(left, top, right, bottom):
    """Build a character-box dict (float corners plus centre point)."""
    return {
        'x1': float(left),
        'y1': float(top),
        'x2': float(right),
        'y2': float(bottom),
        'center_x': float((left + right) / 2),
        'center_y': float((top + bottom) / 2),
    }


def detect_characters_with_opencv(img, text_bbox, text_content, ocr_bbox_hint=None):
    """Locate individual characters inside a text region with OpenCV.

    Two segmentations of the binarised region are computed: a projection
    profile split and an external-contour split. Whichever yields a box count
    closer to the expected character count is kept (contours win ties). The
    result is then passed through ``refine_char_boxes_with_ocr_hint`` (only
    when ``ocr_bbox_hint`` is truthy and the count is off),
    ``remove_overlapping_boxes``, and finally replaced by an evenly spaced
    estimate from ``estimate_char_boxes_from_text_bbox`` if the count still
    does not match.

    Args:
        img: Source image as a NumPy array (BGR colour or single channel).
        text_bbox: Dict with ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'`` giving the
            text region in image coordinates.
        text_content: Recognised text; its length without ASCII spaces is the
            expected number of characters.
        ocr_bbox_hint: Optional OCR bounding box forwarded to the refinement
            step.

    Returns:
        A list of dicts with float ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``,
        ``'center_x'``, ``'center_y'`` in image coordinates. Boxes that come
        from contour detection also carry ``'area'`` (the contour area).
        Returns ``[]`` when the text has no non-space characters or the
        clipped region is empty.
    """
    expected_char_count = len(text_content.replace(' ', ''))
    if expected_char_count == 0:
        # Nothing to locate; this also avoids dividing by zero in the
        # contour-area thresholds below.
        return []

    # Clip the requested region to the image bounds.
    img_h, img_w = img.shape[:2]
    x1 = max(0, int(text_bbox['x1']))
    y1 = max(0, int(text_bbox['y1']))
    x2 = min(img_w, int(text_bbox['x2']))
    y2 = min(img_h, int(text_bbox['y2']))
    if x2 <= x1 or y2 <= y1:
        return []

    text_roi = img[y1:y2, x1:x2].copy()
    if text_roi.size == 0:
        return []

    if len(text_roi.shape) == 3:
        gray_roi = cv2.cvtColor(text_roi, cv2.COLOR_BGR2GRAY)
    else:
        gray_roi = text_roi

    # Adaptive threshold copes with uneven lighting inside the region; the
    # inverted mode makes ink pixels white (255).
    binary = cv2.adaptiveThreshold(
        gray_roi, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 11, 2
    )

    # Close then open with a small kernel: join strokes, then drop specks.
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)

    # A region clearly taller than wide is treated as vertical text.
    roi_height, roi_width = binary.shape
    is_vertical = roi_height > roi_width * 1.2

    char_boxes = []
    if is_vertical:
        # Split along x using the per-column ink profile; each run of columns
        # becomes one box whose vertical extent is the rows containing ink.
        column_profile = np.sum(binary, axis=0)
        column_runs = _projection_runs(column_profile, np.max(column_profile) * 0.1)
        for start_col, end_col in column_runs:
            ink_rows = np.where(np.sum(binary[:, start_col:end_col], axis=1) > 0)[0]
            if len(ink_rows) > 0:
                char_boxes.append(_make_char_box(
                    x1 + start_col, y1 + ink_rows[0],
                    x1 + end_col, y1 + ink_rows[-1],
                ))
    else:
        # Split along y using the per-row ink profile; each run of rows
        # becomes one box whose horizontal extent is the columns containing ink.
        row_profile = np.sum(binary, axis=1)
        row_runs = _projection_runs(row_profile, np.max(row_profile) * 0.1)
        for start_row, end_row in row_runs:
            ink_cols = np.where(np.sum(binary[start_row:end_row, :], axis=0) > 0)[0]
            if len(ink_cols) > 0:
                char_boxes.append(_make_char_box(
                    x1 + ink_cols[0], y1 + start_row,
                    x1 + ink_cols[-1], y1 + end_row,
                ))

    # Contour-based segmentation, always computed as an alternative.
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    roi_area = roi_width * roi_height
    min_area = roi_area / (expected_char_count * 20)  # reject specks
    max_area = roi_area / 2                           # reject region-sized blobs

    contour_boxes = []
    for contour in contours:
        area = cv2.contourArea(contour)
        if not min_area < area < max_area:
            continue
        cx, cy, cw, ch = cv2.boundingRect(contour)
        aspect_ratio = ch / cw if cw > 0 else 0
        # Keep only shapes with a plausible character aspect ratio.
        if 0.2 < aspect_ratio < 5.0:
            box = _make_char_box(x1 + cx, y1 + cy, x1 + cx + cw, y1 + cy + ch)
            # Record this contour's own area (not a value left over from a
            # previous loop iteration).
            box['area'] = area
            contour_boxes.append(box)

    if contour_boxes:
        # Order top-to-bottom, then right-to-left.
        contour_boxes.sort(key=lambda b: (b['y1'], -b['center_x']))
        contour_error = abs(len(contour_boxes) - expected_char_count)
        projection_error = abs(len(char_boxes) - expected_char_count)
        if contour_error <= projection_error:
            char_boxes = contour_boxes

    # Count still off: let the OCR hint help, if one was supplied.
    if len(char_boxes) != expected_char_count and ocr_bbox_hint:
        char_boxes = refine_char_boxes_with_ocr_hint(
            img, text_bbox, text_content, char_boxes, ocr_bbox_hint, expected_char_count
        )

    char_boxes = remove_overlapping_boxes(char_boxes, expected_char_count)

    # Last resort: evenly spaced boxes derived from the text bbox alone.
    if len(char_boxes) != expected_char_count:
        char_boxes = estimate_char_boxes_from_text_bbox(
            text_bbox, text_content, expected_char_count, is_vertical
        )

    return char_boxes
def _centered_char_box(center_x, center_y, width, height):
    """Build a box dict of the given size centred on ``(center_x, center_y)``."""
    return {
        'x1': float(center_x - width / 2),
        'y1': float(center_y - height / 2),
        'x2': float(center_x + width / 2),
        'y2': float(center_y + height / 2),
        'center_x': float(center_x),
        'center_y': float(center_y),
    }


def refine_char_boxes_with_ocr_hint(img, text_bbox, text_content, detected_boxes, ocr_bbox_hint, expected_count):
    """Rebuild character boxes on a regular grid when detection is far off.

    If the number of detected boxes is within 2 of ``expected_count`` the
    detected list is returned untouched. Otherwise ``text_bbox`` is divided
    into ``expected_count`` evenly sized cells:

    * vertical layout (height > 1.2 * width): a grid of columns and rows; a
      cell reuses the nearest detected box when its centre lies within one
      cell width (Manhattan distance) of the cell centre, otherwise the cell
      itself is used;
    * horizontal layout: a single row of equal-width cells; detected boxes
      are not consulted.

    Args:
        img: Source image (accepted for interface compatibility; unused).
        text_bbox: Dict with ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``.
        text_content: Recognised text (unused here).
        detected_boxes: Boxes already found; each needs ``'center_x'`` and
            ``'center_y'``.
        ocr_bbox_hint: OCR bounding box (unused here).
        expected_count: Number of characters the result should contain.

    Returns:
        ``detected_boxes`` itself when its length is within 2 of
        ``expected_count``; otherwise a list of ``expected_count`` box dicts
        (an empty list when ``expected_count`` is not positive).
    """
    if abs(len(detected_boxes) - expected_count) <= 2:
        return detected_boxes

    if expected_count <= 0:
        # No cells to lay out; also avoids dividing by zero below.
        return []

    left = text_bbox['x1']
    top = text_bbox['y1']
    roi_width = text_bbox['x2'] - text_bbox['x1']
    roi_height = text_bbox['y2'] - text_bbox['y1']
    is_vertical = roi_height > roi_width * 1.2

    if not is_vertical:
        # Horizontal: one row of equal-width, full-height cells.
        char_width = roi_width / expected_count
        center_y = top + roi_height / 2
        return [
            _centered_char_box(
                left + i * char_width + char_width / 2, center_y, char_width, roi_height
            )
            for i in range(expected_count)
        ]

    # Vertical: guess the column count from an assumed character height of
    # 0.8 * (region height / character count), then derive the row count.
    estimated_cols = max(1, int(roi_width / (roi_height / expected_count * 0.8)))
    estimated_rows = (expected_count + estimated_cols - 1) // estimated_cols
    char_width = roi_width / estimated_cols
    char_height = roi_height / estimated_rows

    estimated_boxes = []
    for i in range(expected_count):
        row, col = divmod(i, estimated_cols)
        est_x = left + col * char_width + char_width / 2
        est_y = top + row * char_height + char_height / 2

        # Nearest detected box to this cell centre (Manhattan distance).
        nearest_box = None
        nearest_dist = float('inf')
        for box in detected_boxes:
            dist = abs(box['center_x'] - est_x) + abs(box['center_y'] - est_y)
            if dist < nearest_dist:
                nearest_dist = dist
                nearest_box = box

        if nearest_box is not None and nearest_dist < char_width:
            estimated_boxes.append(nearest_box)
        else:
            estimated_boxes.append(
                _centered_char_box(est_x, est_y, char_width, char_height)
            )

    return estimated_boxes
def remove_overlapping_boxes(char_boxes, expected_count):
    """Drop heavily overlapping character boxes and cap the list length.

    When the list already has at most ``expected_count`` entries it is
    returned as-is, without any overlap check. Otherwise boxes are visited in
    ``(y1, center_x)`` order and a box is discarded when its intersection with
    an already kept box exceeds half the area of the smaller of the two. If
    that leaves fewer than ``expected_count`` boxes, the result is rebuilt
    instead by clustering boxes whose centres are less than 20 px apart
    (Manhattan distance) and keeping the largest box of each cluster.

    Args:
        char_boxes: List of box dicts with ``'x1'``, ``'y1'``, ``'x2'``,
            ``'y2'``, ``'center_x'``, ``'center_y'``.
        expected_count: Desired number of characters.

    Returns:
        The filtered boxes, truncated to ``expected_count`` entries.
    """
    if len(char_boxes) <= expected_count:
        return char_boxes

    def box_area(b):
        return (b['x2'] - b['x1']) * (b['y2'] - b['y1'])

    def heavily_overlaps(a, b):
        inter_x1 = max(a['x1'], b['x1'])
        inter_y1 = max(a['y1'], b['y1'])
        inter_x2 = min(a['x2'], b['x2'])
        inter_y2 = min(a['y2'], b['y2'])
        if not (inter_x2 > inter_x1 and inter_y2 > inter_y1):
            return False
        inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
        # "Heavy" means more than half of the smaller box is covered.
        return inter_area > min(box_area(a), box_area(b)) * 0.5

    def centers_close(a, b):
        gap = abs(a['center_x'] - b['center_x']) + abs(a['center_y'] - b['center_y'])
        return gap < 20

    ordered = sorted(char_boxes, key=lambda b: (b['y1'], b['center_x']))

    kept = []
    for candidate in ordered:
        if not any(heavily_overlaps(candidate, earlier) for earlier in kept):
            kept.append(candidate)

    if len(kept) < expected_count:
        # Too few survivors: cluster nearby boxes and keep one per cluster.
        clusters = []
        for candidate in ordered:
            home = next(
                (
                    cluster for cluster in clusters
                    if any(centers_close(candidate, member) for member in cluster)
                ),
                None,
            )
            if home is None:
                clusters.append([candidate])
            else:
                home.append(candidate)
        kept = [max(cluster, key=box_area) for cluster in clusters]

    return kept[:expected_count]
def estimate_char_boxes_from_text_bbox(text_bbox, text_content, expected_count, is_vertical):
    """Estimate evenly spaced character boxes from the text bounding box.

    Used as the fallback when image-based character detection fails. The
    bounding box is divided into a grid of equal cells, filled row by row
    from the top-left cell:

    * vertical layout: the column count is guessed from an assumed character
      height of ``0.8 * (height / expected_count)`` and the row count follows
      from it;
    * horizontal layout: a single row of ``expected_count`` columns.

    Args:
        text_bbox: Dict with ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``.
        text_content: Recognised text (accepted for interface compatibility;
            unused).
        expected_count: Number of boxes to produce.
        is_vertical: Whether the text is laid out vertically.

    Returns:
        A list of ``expected_count`` dicts with float ``'x1'``, ``'y1'``,
        ``'x2'``, ``'y2'``, ``'center_x'``, ``'center_y'``; an empty list when
        ``expected_count`` is not positive.
    """
    if expected_count <= 0:
        # No characters to place; also avoids dividing by zero below.
        return []

    left = text_bbox['x1']
    top = text_bbox['y1']
    roi_width = text_bbox['x2'] - text_bbox['x1']
    roi_height = text_bbox['y2'] - text_bbox['y1']

    if is_vertical:
        approx_char_height = roi_height / expected_count * 0.8
        if approx_char_height == 0:
            # Degenerate zero-height box: fall back to a single column
            # instead of dividing by zero.
            grid_cols = 1
        else:
            grid_cols = max(1, int(roi_width / approx_char_height))
        grid_rows = (expected_count + grid_cols - 1) // grid_cols
    else:
        # A horizontal line is a one-row grid with one column per character.
        grid_cols = expected_count
        grid_rows = 1

    char_width = roi_width / grid_cols
    char_height = roi_height / grid_rows

    estimated_boxes = []
    for i in range(expected_count):
        row, col = divmod(i, grid_cols)
        center_x = left + col * char_width + char_width / 2
        center_y = top + row * char_height + char_height / 2
        estimated_boxes.append({
            'x1': float(center_x - char_width / 2),
            'y1': float(center_y - char_height / 2),
            'x2': float(center_x + char_width / 2),
            'y2': float(center_y + char_height / 2),
            'center_x': float(center_x),
            'center_y': float(center_y),
        })

    return estimated_boxes
# Turn off oneDNN / MKL-DNN before Paddle is imported; the original note cites
# a NotImplementedError seen with PaddlePaddle 3.3.0.
os.environ.update({
    'FLAGS_onednn': '0',
    'FLAGS_use_mkldnn': '0',
})

# On Windows, re-wrap the standard streams as UTF-8 so non-ASCII log output
# is replaced rather than raising an encoding error.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# The project root is two levels above this file; make the bundled
# comic-text-detector checkout importable.
project_root = Path(__file__).parent.parent
comic_detector_path = project_root.joinpath(
    'comic-text-detector-master', 'comic-text-detector-master'
)
sys.path.insert(0, str(comic_detector_path))

# Optional local OnnxOCR checkout (fallback option), added only if present.
onnxocr_path = project_root.joinpath('OnnxOCR-main', 'OnnxOCR-main')
if onnxocr_path.exists():
    sys.path.insert(0, str(onnxocr_path))
# comic-text-detector imports wandb, but inference does not need it. When the
# package is missing, register a stand-in object under the same module name so
# that later ``import wandb`` statements succeed.
try:
    import wandb
except ImportError:
    class FakeWandb:
        """Stand-in for the wandb module whose entry points do nothing."""

        @staticmethod
        def init(*args, **kwargs):
            """Accept any arguments and return None."""
            return None

        @staticmethod
        def log(*args, **kwargs):
            """Accept any arguments and return None."""
            return None

        @staticmethod
        def log_model(*args, **kwargs):
            """Accept any arguments and return None."""
            return None

    sys.modules['wandb'] = FakeWandb()
# Import the detector and its image I/O helpers from the comic-text-detector
# checkout; without them the script cannot run, so report and exit with 1.
try:
    from inference import TextDetector, REFINEMASK_ANNOTATION
    from utils.io_utils import imread, imwrite
except ImportError as import_error:
    import traceback

    print(f"[ERROR] 无法导入comic-text-detector模块: {import_error}")
    print("[INFO] 请确保已安装依赖: pip install torch torchvision opencv-python numpy tqdm")
    traceback.print_exc()
    raise SystemExit(1)
# PaddleOCR is the only OCR engine used. Prefer a local PaddleOCR checkout
# under the project root when one exists, then record whether the import
# succeeded in PADDLEOCR_AVAILABLE.
paddleocr_path = project_root.joinpath('PaddleOCR-main', 'PaddleOCR-main')
if paddleocr_path.exists():
    sys.path.insert(0, str(paddleocr_path))

try:
    from paddleocr import PaddleOCR
except ImportError as import_error:
    print(f"[ERROR] 无法导入PaddleOCR模块: {import_error}")
    print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
    PADDLEOCR_AVAILABLE = False
else:
    PADDLEOCR_AVAILABLE = True
    print("[INFO] PaddleOCR 可用")
# Panel detection lives in python/generate-anim/detect_panels.py. Import it
# from this script's own directory; if that fails, define simplified local
# versions with a call-compatible signature.
try:
    current_dir = Path(__file__).parent
    if str(current_dir) not in sys.path:
        sys.path.insert(0, str(current_dir))
    from detect_panels import detect_comic_panels, merge_panel_mask_with_text_mask
except ImportError as e:
    print(f"[WARN] 无法导入detect_panels模块,使用本地实现: {e}")

    def detect_comic_panels(img, text_blocks=None, text_mask=None):
        """Detect comic panels (frame borders) with OpenCV - local fallback.

        Args:
            img: Page image as a NumPy array (3-channel BGR or single channel).
            text_blocks: Accepted so that callers can pass the same keyword
                arguments as to the imported implementation; ignored here.
            text_mask: Accepted for the same reason; ignored here.

        Returns:
            ``(panel_mask, panels)``: a single-channel mask with detected
            lines and panel rectangles drawn in white, and a list of dicts
            with ``'x'``, ``'y'``, ``'width'``, ``'height'``, ``'center_x'``,
            ``'center_y'`` for each panel candidate.
        """
        if len(img.shape) == 3:
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img.copy()

        panel_mask = np.zeros_like(gray)
        edges = cv2.Canny(gray, 50, 150, apertureSize=3)

        # Keep only long horizontal / vertical edge segments.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        horizontal_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, horizontal_kernel)

        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
        vertical_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, vertical_kernel)

        lines_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)
        lines = cv2.HoughLinesP(lines_mask, 1, np.pi/180, threshold=100,
                                minLineLength=50, maxLineGap=10)

        if lines is not None:
            for line in lines:
                x1, y1, x2, y2 = line[0]
                cv2.line(panel_mask, (x1, y1), (x2, y2), 255, 2)

        kernel = np.ones((3, 3), np.uint8)
        dilated = cv2.dilate(lines_mask, kernel, iterations=2)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        panels = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            area = w * h
            # Ignore candidates whose bounding box is at most 1% of the page.
            if area > img.shape[0] * img.shape[1] * 0.01:
                cv2.rectangle(panel_mask, (x, y), (x + w, y + h), 255, 2)
                panels.append({
                    'x': x,
                    'y': y,
                    'width': w,
                    'height': h,
                    'center_x': x + w / 2,
                    'center_y': y + h / 2
                })

        return panel_mask, panels

    def merge_panel_mask_with_text_mask(panel_mask, text_mask):
        """Combine the panel mask and the text mask by per-pixel maximum.

        The panel mask is resized to the text mask's shape first when the
        two shapes differ.
        """
        if panel_mask.shape != text_mask.shape:
            panel_mask = cv2.resize(panel_mask, (text_mask.shape[1], text_mask.shape[0]))
        return np.maximum(panel_mask, text_mask)
def get_text_block_panel(text_block, panels):
    """Find which panel contains the centre of a text block.

    Args:
        text_block: Dict whose ``'bbox'`` entry has ``'x1'``, ``'y1'``,
            ``'x2'``, ``'y2'``.
        panels: List of panel dicts with ``'x'``, ``'y'``, ``'width'``,
            ``'height'``.

    Returns:
        Index of the first panel whose rectangle (edges inclusive) contains
        the centre of the bbox, or -1 when no panel does.
    """
    box = text_block['bbox']
    mid_x = (box['x1'] + box['x2']) / 2
    mid_y = (box['y1'] + box['y2']) / 2

    def holds_center(panel):
        return (panel['x'] <= mid_x <= panel['x'] + panel['width']
                and panel['y'] <= mid_y <= panel['y'] + panel['height'])

    return next(
        (idx for idx, panel in enumerate(panels) if holds_center(panel)),
        -1,
    )
def sort_text_blocks_by_panels(text_blocks, panels, image_width, image_height):
    """Order text blocks in Japanese manga reading order.

    Blocks are bucketed into vertical bands measured from the right edge of
    the page (band width: 20% of the image width, at least 100 px). Bands are
    read right to left, blocks within a band top to bottom, and ties are
    broken by the larger x centre first.

    Each block dict is annotated in place with ``'center_x'``, ``'center_y'``
    and ``'panel_index'`` (from ``get_text_block_panel``). The panel index is
    recorded only; the ordering itself uses just the block centres.

    Args:
        text_blocks: List of block dicts, each with a ``'bbox'`` dict holding
            ``'x1'``, ``'y1'``, ``'x2'``, ``'y2'``.
        panels: List of panel dicts used for the ``'panel_index'`` annotation.
        image_width: Page width in pixels.
        image_height: Page height in pixels (not used by the ordering).

    Returns:
        A new list with the same block dicts in reading order; ``[]`` when
        ``text_blocks`` is empty.
    """
    if not text_blocks:
        return []

    # Annotate every block with its centre point and containing panel.
    for blk in text_blocks:
        box = blk['bbox']
        blk['center_x'] = (box['x1'] + box['x2']) / 2
        blk['center_y'] = (box['y1'] + box['y2']) / 2
        blk['panel_index'] = get_text_block_panel(blk, panels)

    # Band width used to group blocks into right-to-left columns.
    band_width = max(image_width * 0.2, 100)

    def reading_order(blk):
        cx = blk['center_x']
        cy = blk['center_y']
        # Band 0 is the right-most band; larger numbers lie further left.
        band = int((image_width - cx) / band_width)
        return (band, cy, -cx)

    return sorted(text_blocks, key=reading_order)
- def detect_and_ocr_comic(image_path, model_path=None, output_dir=None):
- """
- 检测漫画文字区域并用OCR识别
-
- 参数:
- image_path: 图片路径
- model_path: comic-text-detector模型路径
- output_dir: 输出目录
- """
- image_path = Path(image_path)
-
- if not image_path.exists():
- raise FileNotFoundError(f"图片文件不存在: {image_path}")
-
- print(f"📖 正在处理图片: {image_path.name}")
-
- # 设置模型路径
- if model_path is None:
- possible_paths = [
- comic_detector_path / 'data' / 'comictextdetector.pt',
- comic_detector_path / 'data' / 'comictextdetector.pt.onnx',
- ]
- model_path = None
- for path in possible_paths:
- if path.exists():
- model_path = path
- break
-
- if model_path is None:
- raise FileNotFoundError(
- f"未找到comic-text-detector模型文件。请下载模型并放到以下位置之一:\n" +
- "\n".join([f" - {p}" for p in possible_paths])
- )
-
- # 设置输出目录
- if output_dir is None:
- output_dir = image_path.parent
- else:
- output_dir = Path(output_dir)
- output_dir.mkdir(parents=True, exist_ok=True)
-
- # 创建tmp子目录用于保存中间处理文件
- tmp_dir = output_dir / 'tmp'
- tmp_dir.mkdir(parents=True, exist_ok=True)
-
- # 初始化comic-text-detector
- device = 'cuda' if __import__('torch').cuda.is_available() else 'cpu'
- print(f"[INFO] 使用设备: {device}")
-
- try:
- detector = TextDetector(
- model_path=str(model_path),
- input_size=1024,
- device=device,
- act='leaky'
- )
- except Exception as e:
- print(f"[ERROR] 初始化检测器失败: {e}")
- raise
-
- # 初始化PaddleOCR(唯一使用)
- print("[INFO] 初始化PaddleOCR...")
- ocr_engine = None
- paddleocr_instance = None
-
- if not PADDLEOCR_AVAILABLE:
- raise RuntimeError("PaddleOCR 不可用,请确保已正确安装 paddlex[ocr-core]")
-
- try:
- # 初始化PaddleOCR,使用中文模型
- # enable_mkldnn=False 禁用 MKL-DNN 以避免 NotImplementedError
- # use_angle_cls=True 启用角度分类器,可以更好地识别竖排文字
- paddleocr_instance = PaddleOCR(
- use_angle_cls=True, # 启用角度分类器,支持竖排文字识别
- lang='ch', # 中文
- enable_mkldnn=False # 禁用 MKL-DNN 以避免 oneDNN 错误
- )
- ocr_engine = 'paddleocr'
- print("[INFO] PaddleOCR 初始化成功")
- except Exception as e:
- print(f"[ERROR] PaddleOCR初始化失败: {e}")
- raise RuntimeError(f"PaddleOCR 初始化失败: {e}")
-
- # 读取图片
- img = imread(str(image_path))
- if img is None:
- raise ValueError(f"无法读取图片文件: {image_path}")
-
- im_h, im_w = img.shape[:2]
- print(f"[INFO] 图片尺寸: {im_w}x{im_h}")
-
- image_name = image_path.stem
-
- # 步骤1: 使用comic-text-detector检测文字区域(先检测文字块,用于辅助格子检测)
- print("[INFO] 步骤1: 检测文字区域...")
- try:
- mask, mask_refined, blk_list = detector(
- img,
- refine_mode=REFINEMASK_ANNOTATION,
- keep_undetected_mask=True
- )
- except Exception as e:
- print(f"[ERROR] 检测失败: {e}")
- raise
-
- print(f"[OK] 检测到 {len(blk_list)} 个文字区域")
-
- # 步骤2: 使用文字遮罩图和文字块信息辅助检测漫画格子
- print("[INFO] 步骤2: 检测漫画格子(使用文字遮罩图和文字块信息辅助)...")
-
- # 将文字块转换为统一格式
- text_blocks = []
- for blk in blk_list:
- x1, y1, x2, y2 = blk.xyxy
- text_blocks.append({
- 'xyxy': [int(x1), int(y1), int(x2), int(y2)]
- })
-
- # 使用文字遮罩图和文字块信息检测格子(优先使用文字遮罩图)
- panel_mask, panels = detect_comic_panels(img, text_blocks=text_blocks, text_mask=mask_refined)
- print(f"[OK] 检测到 {len(panels)} 个格子")
-
- # 如果检测到的格子太少,尝试不使用辅助信息重新检测
- if len(panels) < 4:
- print(f"[WARN] 检测到的格子数量较少({len(panels)}个),尝试使用传统方法重新检测...")
- panel_mask_fallback, panels_fallback = detect_comic_panels(img, text_blocks=None, text_mask=None)
- if len(panels_fallback) > len(panels):
- panel_mask = panel_mask_fallback
- panels = panels_fallback
- print(f"[OK] 使用传统方法检测到 {len(panels)} 个格子")
-
- # 保存格子遮罩图到tmp目录(中间文件)
- panel_mask_path = tmp_dir / f"{image_name}_panel_mask.png"
- imwrite(str(panel_mask_path), panel_mask)
- print(f"[OK] 已保存格子遮罩图: {panel_mask_path}")
-
- # 保存格子信息JSON到tmp目录(中间文件)
- panels_json = {
- 'image_file': image_path.name,
- 'panels': panels,
- 'total_count': len(panels)
- }
- panels_json_path = tmp_dir / f"{image_name}_panels.json"
- with open(panels_json_path, 'w', encoding='utf-8') as f:
- json.dump(panels_json, f, ensure_ascii=False, indent=2)
- print(f"[OK] 已保存格子信息: {panels_json_path}")
-
- # 保存原始文字遮罩图到tmp目录(中间文件)
- text_mask_path = tmp_dir / f"{image_name}_text_mask.png"
- imwrite(str(text_mask_path), mask_refined)
- print(f"[OK] 已保存文字遮罩图: {text_mask_path}")
-
- # 步骤3: 合并格子遮罩图和文字mask图
- print("[INFO] 步骤3: 合并格子遮罩图和文字mask图...")
- combined_mask = merge_panel_mask_with_text_mask(panel_mask, mask_refined)
-
- # 保存合并后的mask图片到tmp目录(中间文件)
- combined_mask_path = tmp_dir / f"{image_name}_combined_mask.png"
- print(f"[INFO] 步骤4: 保存合并后的mask图片到磁盘...")
- imwrite(str(combined_mask_path), combined_mask)
- print(f"[OK] 已保存合并后的mask图片: {combined_mask_path}")
-
- # 确认文件已生成
- if not text_mask_path.exists():
- raise FileNotFoundError(f"文字遮罩图文件未成功生成: {text_mask_path}")
- print(f"[OK] 已确认文字遮罩图文件存在")
-
- # 步骤5: 从保存的mask文件中读取,裁剪每个文字区域,然后识别
- print(f"[INFO] 步骤5: 从mask文件中读取并识别 {len(blk_list)} 个文字区域...")
-
- # 使用合并后的mask(已经在内存中,不需要重新读取)
- mask_img = combined_mask
-
- dialogues = []
-
- for i, blk in enumerate(blk_list):
- x1, y1, x2, y2 = blk.xyxy
-
- # 确保坐标在图片范围内
- x1 = max(0, int(x1))
- y1 = max(0, int(y1))
- x2 = min(im_w, int(x2))
- y2 = min(im_h, int(y2))
-
- # 从mask图片中裁剪对应的文字区域
- crop_mask = mask_img[y1:y2, x1:x2]
-
- if crop_mask.size == 0:
- continue
-
- # 同时从原图中裁剪对应的文字区域(用于OCR识别,效果更好)
- crop_img = img[y1:y2, x1:x2]
-
- # 确保是RGB格式(Tesseract可以直接使用,但统一使用RGB格式)
- if len(crop_img.shape) == 2:
- # 如果是灰度图,转换为RGB
- crop_img = cv2.cvtColor(crop_img, cv2.COLOR_GRAY2RGB)
- elif len(crop_img.shape) == 3 and crop_img.shape[2] == 4:
- # 如果是RGBA,转换为RGB
- crop_img = cv2.cvtColor(crop_img, cv2.COLOR_RGBA2RGB)
-
- # 对图片进行预处理以提高OCR识别率(保守处理,避免过度处理)
- # 1. 转换为灰度图
- if len(crop_img.shape) == 3:
- gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
- else:
- gray = crop_img
-
- # 2. 检测是否为黑底白字(黑白漫画)
- # 计算图片的平均亮度
- mean_brightness = np.mean(gray)
- is_dark_background = mean_brightness < 127 # 如果平均亮度小于127,可能是黑底
-
- # 如果是黑底白字,先反转颜色(OCR模型通常训练在白底黑字上)
- if is_dark_background:
- gray = cv2.bitwise_not(gray)
-
- # 3. 适度放大图片(仅对很小的文字区域)
- h, w = gray.shape[:2]
- if h < 32 or w < 32: # 只有很小的文字区域才放大
- scale = 2.0
- new_h, new_w = int(h * scale), int(w * scale)
- gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
-
- # 4. 增强对比度(使用CLAHE,保守设置)
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
- enhanced = clahe.apply(gray)
-
- # 5. 轻度去噪处理(避免过度模糊)
- enhanced = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
-
- # 6. 转换回RGB格式(Tesseract可以直接使用灰度图,但RGB也可以)
- if len(enhanced.shape) == 2:
- crop_img_processed = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)
- else:
- crop_img_processed = enhanced
-
- text_block = {
- 'index': i + 1,
- 'bbox': {
- 'x1': x1,
- 'y1': y1,
- 'x2': x2,
- 'y2': y2,
- 'width': x2 - x1,
- 'height': y2 - y1,
- 'center_x': (x1 + x2) / 2,
- 'center_y': (y1 + y2) / 2
- }
- }
-
- try:
- if ocr_engine == 'paddleocr':
- # 使用PaddleOCR识别
- try:
- # PaddleOCR返回格式: [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], (text, confidence), ...]
- ocr_result = paddleocr_instance.ocr(crop_img_processed)
-
- # 调试:打印OCR结果格式
- if ocr_result:
- print(f" [DEBUG] 第 {i+1} 个区域: ocr_result类型={type(ocr_result)}, 长度={len(ocr_result) if isinstance(ocr_result, (list, tuple)) else 'N/A'}")
- if len(ocr_result) > 0:
- result_item = ocr_result[0]
- print(f" [DEBUG] 第 {i+1} 个区域: ocr_result[0]类型={type(result_item)}")
- # 检查OCRResult对象的属性
- if hasattr(result_item, '__dict__'):
- print(f" [DEBUG] 第 {i+1} 个区域: OCRResult属性={list(result_item.__dict__.keys())}")
- # 尝试转换为列表或字典
- try:
- if hasattr(result_item, 'text_lines') or hasattr(result_item, 'texts'):
- print(f" [DEBUG] 第 {i+1} 个区域: 尝试访问text_lines或texts属性")
- except:
- pass
-
- if ocr_result and len(ocr_result) > 0:
- # PaddleOCR 3.x 返回的是 OCRResult 对象
- result_item = ocr_result[0]
-
- # OCRResult 对象有 json 属性,返回字典格式
- # 结构: {'res': {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...], 'rec_boxes': [...]}}
- try:
- result_json = result_item.json
- res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {}
-
- # 提取文本、置信度、坐标
- rec_texts = res_data.get('rec_texts', [])
- rec_scores = res_data.get('rec_scores', [])
- rec_polys = res_data.get('rec_polys', []) # 多边形坐标 [[[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ...]
- rec_boxes = res_data.get('rec_boxes', []) # 边界框 [[x1,y1,x2,y2], ...]
-
- if not rec_texts:
- print(f" [DEBUG] 第 {i+1} 个区域: PaddleOCR未识别到文字")
- continue
-
- text_lines_with_bbox = []
- all_texts = []
- all_char_boxes_list = []
-
- # 关键改进:先收集所有文本行和它们的边界框,然后对整个文本区域进行字符检测
- # 这样可以确保OpenCV检测到所有字符,而不仅仅是单个文本行的字符
-
- # 解析PaddleOCR结果,收集所有文本行
- all_text_lines = [] # 存储所有文本行及其边界框
- for idx, text in enumerate(rec_texts):
- if not text or not text.strip():
- continue
-
- # 获取置信度
- confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9
-
- # 获取坐标(优先使用多边形坐标,如果没有则使用边界框)
- if idx < len(rec_polys) and rec_polys[idx]:
- bbox_coords = rec_polys[idx] # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
- elif idx < len(rec_boxes) and rec_boxes[idx]:
- # 将边界框转换为多边形格式
- box = rec_boxes[idx] # [x1, y1, x2, y2]
- bbox_coords = [
- [box[0], box[1]], # 左上
- [box[2], box[1]], # 右上
- [box[2], box[3]], # 右下
- [box[0], box[3]] # 左下
- ]
- else:
- print(f" [DEBUG] 第 {i+1} 个区域: 第 {idx} 个文本没有坐标信息")
- continue
-
- if text and text.strip():
- # 计算边界框
- # 检查 bbox_coords 格式
- if not isinstance(bbox_coords, (list, tuple)) or len(bbox_coords) < 4:
- print(f" [DEBUG] 第 {i+1} 个区域: bbox_coords 格式不正确: {type(bbox_coords)}, {bbox_coords}")
- continue
-
- # 检查每个坐标点格式
- try:
- x_coords = []
- y_coords = []
- for coord in bbox_coords:
- if isinstance(coord, (list, tuple)) and len(coord) >= 2:
- x_coords.append(coord[0])
- y_coords.append(coord[1])
- else:
- print(f" [DEBUG] 第 {i+1} 个区域: 坐标点格式不正确: {coord}")
- break
-
- if not x_coords or not y_coords or len(x_coords) < 4:
- print(f" [DEBUG] 第 {i+1} 个区域: 无法提取足够的坐标点")
- continue
- except (TypeError, IndexError) as e:
- print(f" [DEBUG] 第 {i+1} 个区域: 解析坐标失败: {e}, bbox_coords={bbox_coords}")
- continue
- left = min(x_coords)
- top = min(y_coords)
- right = max(x_coords)
- bottom = max(y_coords)
-
- # 转换为绝对坐标(相对于原图)
- char_bbox = {
- 'x1': float(x1 + left),
- 'y1': float(y1 + top),
- 'x2': float(x1 + right),
- 'y2': float(y1 + bottom),
- 'center_x': float(x1 + (left + right) / 2),
- 'center_y': float(y1 + (top + bottom) / 2)
- }
-
- text_lines_with_bbox.append({
- 'text': text,
- 'bbox': char_bbox,
- 'confidence': confidence
- })
- all_texts.append((text, confidence))
-
- # 收集文本行信息,稍后统一处理
- all_text_lines.append({
- 'text': text,
- 'bbox': char_bbox,
- 'confidence': confidence
- })
-
- # 关键改进:对所有文本行合并后的整个区域进行字符检测
- if all_text_lines:
- # 计算整个文本区域的边界框(包含所有文本行)
- all_x1 = [line['bbox']['x1'] for line in all_text_lines]
- all_y1 = [line['bbox']['y1'] for line in all_text_lines]
- all_x2 = [line['bbox']['x2'] for line in all_text_lines]
- all_y2 = [line['bbox']['y2'] for line in all_text_lines]
-
- combined_bbox = {
- 'x1': float(min(all_x1)),
- 'y1': float(min(all_y1)),
- 'x2': float(max(all_x2)),
- 'y2': float(max(all_y2)),
- 'center_x': float((min(all_x1) + max(all_x2)) / 2),
- 'center_y': float((min(all_y1) + max(all_y2)) / 2)
- }
-
- # 合并所有文本行的文本
- combined_text_for_detection = ''.join([line['text'] for line in all_text_lines])
-
- # 使用OpenCV检测整个文本区域的所有字符
- # 注意:text_bbox_for_detection必须使用绝对坐标(相对于原图)
- # 因为detect_characters_with_opencv函数期望的是原图坐标
- text_bbox_for_detection = {
- 'x1': combined_bbox['x1'],
- 'y1': combined_bbox['y1'],
- 'x2': combined_bbox['x2'],
- 'y2': combined_bbox['y2']
- }
-
- # 使用OpenCV检测字符位置(需要传入原图img,而不是crop_img)
- # 注意:坐标是相对于原图的,所以需要传入原图
- # 传入OCR的边界框作为参考,提高识别率
- detected_char_boxes = detect_characters_with_opencv(
- img, text_bbox_for_detection, combined_text_for_detection, ocr_bbox_hint=combined_bbox
- )
-
- # 调试输出:检查OpenCV是否识别出所有字符
- if '远道' in combined_text_for_detection or '石田' in combined_text_for_detection:
- print(f" [DEBUG] 合并后OCR文本: {combined_text_for_detection}")
- text_no_space_debug = combined_text_for_detection.replace(' ', '')
- print(f" [DEBUG] 去除空格后: {text_no_space_debug}, 字符数: {len(text_no_space_debug)}")
- print(f" [DEBUG] OpenCV检测到的字符框数: {len(detected_char_boxes)}")
- if len(detected_char_boxes) > 0:
- print(f" [DEBUG] 前3个字符框位置: center_x={[b['center_x'] for b in detected_char_boxes[:3]]}, center_y={[b['center_y'] for b in detected_char_boxes[:3]]}")
-
- text_no_space = combined_text_for_detection.replace(' ', '')
-
- if len(detected_char_boxes) > 0 and len(detected_char_boxes) == len(text_no_space):
- # 使用OpenCV检测到的精确位置
- # 关键:OpenCV检测的字符框顺序可能与OCR文本顺序不一致
- # 需要根据字符框的位置来匹配字符,而不是简单地按索引对应
-
- # 方法1:将字符框按位置排序(在OpenCV坐标系中:从上到下、从右到左)
- # 注意:detect_characters_with_opencv函数返回的字符框可能已经按某种顺序排列
- # 但我们需要确保按照正确的阅读顺序(从上到下、从右到左)排序
- sorted_char_boxes = sorted(detected_char_boxes, key=lambda b: (b['y1'], -b['center_x']))
-
- # 反转文本字符,使其与字符框的位置顺序对应
- reversed_text_chars = list(text_no_space[::-1])
-
- # 将排序后的字符框与反转后的文本字符对应
- for k, char_box in enumerate(sorted_char_boxes):
- char = reversed_text_chars[k] if k < len(reversed_text_chars) else '?'
- all_char_boxes_list.append({
- 'char': char,
- 'x1': char_box['x1'],
- 'y1': char_box['y1'],
- 'x2': char_box['x2'],
- 'y2': char_box['y2'],
- 'center_x': char_box['center_x'],
- 'center_y': char_box['center_y']
- })
- else:
- # 如果OpenCV检测失败,回退到估算方法
- if len(text_no_space) > 0:
- bbox_width = right - left
- bbox_height = bottom - top
- is_vertical = bbox_height > bbox_width * 1.2
-
- if is_vertical:
- # 竖排:字符从上到下(y坐标从小到大)
- char_height = bbox_height / len(text_no_space)
- for k, char in enumerate(text_no_space):
- char_x = char_bbox['center_x']
- char_y = char_bbox['y1'] + char_height * (k + 0.5)
- all_char_boxes_list.append({
- 'char': char,
- 'x1': char_x - 5,
- 'y1': char_y - char_height/2,
- 'x2': char_x + 5,
- 'y2': char_y + char_height/2,
- 'center_x': char_x,
- 'center_y': char_y
- })
- else:
- # 横排:字符从左到右(估算)
- char_width = bbox_width / len(text_no_space)
- for k, char in enumerate(text_no_space):
- char_x = char_bbox['x1'] + char_width * (k + 0.5)
- char_y = char_bbox['center_y']
- all_char_boxes_list.append({
- 'char': char,
- 'x1': char_x - char_width/2,
- 'y1': char_y - 5,
- 'x2': char_x + char_width/2,
- 'y2': char_y + 5,
- 'center_x': char_x,
- 'center_y': char_y
- })
-
- # 合并所有文字
- if all_texts:
- # 先对文字行进行排序(从右到左、从上到下)
- # 注意:对于日式漫画,阅读顺序是从右到左、从上到下
- # 排序规则:先按Y坐标从上到下(y1越小越靠上),然后按X坐标从右到左(center_x越大越靠右)
- if len(text_lines_with_bbox) > 1:
- text_lines_with_bbox.sort(key=lambda line: (line['bbox']['y1'], -line['bbox']['center_x']))
-
- # 从排序后的text_lines_with_bbox中提取文本
- text_lines = [line['text'] for line in text_lines_with_bbox]
- combined_text = ' '.join(text_lines)
- avg_confidence = sum([t[1] for t in all_texts]) / len(all_texts) if all_texts else 0.0
-
- # 使用字符位置信息
- character_positions = []
- if all_char_boxes_list and len(all_char_boxes_list) > 0:
- # 获取图片高度(用于坐标转换)
- img_height = img.shape[0]
-
- # 注意:字符框已经在前面按位置排序并与文本字符对应了
- # 这里不需要再次排序,保持字符与坐标的对应关系
- # 直接使用all_char_boxes_list,保持字符与坐标的对应关系
- for char_box in all_char_boxes_list:
- # 将坐标转换为数学坐标系(左下角为原点,向上为y轴正方向)
- # 转换中心坐标
- center_x_old = char_box['center_x']
- center_y_old = char_box['center_y']
- center_x_new, center_y_new = convert_coordinate_to_math_system(center_x_old, center_y_old, img_height)
-
- # 转换边界框坐标(用于更精确的位置信息)
- x1_old = char_box['x1']
- y1_old = char_box['y1']
- x2_old = char_box['x2']
- y2_old = char_box['y2']
-
- x1_new, y1_new = convert_coordinate_to_math_system(x1_old, y1_old, img_height)
- x2_new, y2_new = convert_coordinate_to_math_system(x2_old, y2_old, img_height)
-
- # 注意:在数学坐标系中,y1_new > y2_new(因为y1在原图中更靠上,转换后y值更大)
- # 所以需要确保y1是上边界(y值更大),y2是下边界(y值更小)
- y1_math = max(y1_new, y2_new) # 上边界(y值更大)
- y2_math = min(y1_new, y2_new) # 下边界(y值更小)
-
- character_positions.append({
- 'x': center_x_new, # 转换后的中心x坐标(数学坐标系)
- 'y': center_y_new, # 转换后的中心y坐标(数学坐标系)
- 'center_x': center_x_new, # 转换后的中心x坐标
- 'center_y': center_y_new, # 转换后的中心y坐标
- 'x1': min(x1_new, x2_new), # 转换后的左边界x坐标
- 'y1': y1_math, # 转换后的上边界y坐标(数学坐标系中y值更大)
- 'x2': max(x1_new, x2_new), # 转换后的右边界x坐标
- 'y2': y2_math, # 转换后的下边界y坐标(数学坐标系中y值更小)
- 'x_old': center_x_old, # 保留原始中心x坐标(用于调试)
- 'y_old': center_y_old # 保留原始中心y坐标(用于调试)
- })
-
- # 如果字符位置数量不匹配,清空
- text_no_space_for_check = combined_text.replace(' ', '')
- if len(character_positions) != len(text_no_space_for_check):
- if '远道' in combined_text or '石田' in combined_text:
- print(f" [DEBUG] 字符位置数量不匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}, text=\"{combined_text}\"")
- character_positions = []
- elif '远道' in combined_text or '石田' in combined_text:
- print(f" [DEBUG] 字符位置数量匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}")
- print(f" [DEBUG] 前3个character_positions: {[{'x': p.get('center_x', p.get('x', 0)), 'y': p.get('center_y', p.get('y', 0))} for p in character_positions[:3]]}")
-
- # 调试输出:检查character_positions
- if ('远道' in combined_text or '石田' in combined_text) and character_positions:
- print(f" [DEBUG] 保存到dialogues: text=\"{combined_text}\", character_positions数量={len(character_positions)}")
-
- if combined_text and combined_text.strip():
- dialogues.append({
- 'order': i + 1,
- 'text': combined_text,
- 'bbox': text_block['bbox'],
- 'confidence': avg_confidence,
- 'character_positions': character_positions if character_positions else None
- })
- text_preview = combined_text[:30] + '...' if len(combined_text) > 30 else combined_text
- print(f" [{i+1}/{len(blk_list)}] 识别: {text_preview} (置信度: {avg_confidence:.2f})")
- else:
- print(f" [DEBUG] 第 {i+1} 个区域: combined_text为空 (all_texts长度: {len(all_texts)})")
- else:
- print(f" [DEBUG] 第 {i+1} 个区域未识别到文字 (all_texts为空)")
- except Exception as e:
- print(f" [WARN] PaddleOCR解析第 {i+1} 个区域结果失败: {e}")
- import traceback
- traceback.print_exc()
- continue
- except Exception as e:
- print(f" [WARN] PaddleOCR识别第 {i+1} 个区域失败: {e}")
- import traceback
- traceback.print_exc()
- continue
- else:
- # 只使用PaddleOCR,如果失败则报错
- raise RuntimeError(f"OCR引擎不是PaddleOCR,当前引擎: {ocr_engine}")
- except Exception as e:
- print(f" [WARN] 识别第 {i+1} 个区域失败: {e}")
- import traceback
- traceback.print_exc()
- continue
-
- print(f"[OK] 成功识别 {len(dialogues)} 段文字")
-
- # 步骤6: 按格子位置排序(越往上、越往右的格子里的对话顺序越靠前)
- print("[INFO] 步骤6: 按格子位置排序...")
- sorted_dialogues = sort_text_blocks_by_panels(dialogues, panels, im_w, im_h)
-
- # 重新分配order,保留order、text、bbox和character_positions字段
- formatted_dialogues = []
- for i, dialogue in enumerate(sorted_dialogues, 1):
- formatted_dialogues.append({
- 'order': i,
- 'text': dialogue['text'],
- 'bbox': dialogue.get('bbox', {}), # 保留bbox信息用于排序
- 'character_positions': dialogue.get('character_positions') # 保留字符位置信息用于字符排序
- })
-
- # 步骤7: 保存JSON结果到output_dir(ocr目录,最终结果)
- print("[INFO] 步骤7: 保存JSON结果...")
- result = {
- 'image_file': image_path.name,
- 'reading_order': '从右到左、从上到下(日式漫画阅读顺序)',
- 'dialogues': formatted_dialogues,
- 'total_count': len(formatted_dialogues)
- }
-
- # 保存JSON到output_dir(ocr目录,最终结果文件)
- json_path = output_dir / f"{image_name}_dialogues.json"
- with open(json_path, 'w', encoding='utf-8') as f:
- json.dump(result, f, ensure_ascii=False, indent=2)
- print(f"[OK] 已保存对白结果: {json_path}")
-
- return result
def batch_detect_and_ocr(image_dir, model_path=None, output_dir=None):
    """
    Run detection + OCR on every image located directly inside ``image_dir``.

    Files are selected by extension (jpg, jpeg, png, bmp, webp), compared
    case-insensitively, and each file is listed exactly once. They are
    processed in ascending order of the integer that precedes the first
    ``_`` in the file stem (a stem without such a number sorts as 0), with
    the file name as tie-breaker so the order is deterministic.

    Args:
        image_dir: Directory containing the images (str or Path).
        model_path: Passed through unchanged to ``detect_and_ocr_comic``.
        output_dir: Directory handed to ``detect_and_ocr_comic``. Defaults
            to ``image_dir / 'ocr'``. Created if it does not exist.

    Returns:
        List with the return value of ``detect_and_ocr_comic`` for every
        image that was processed without raising, in processing order.
        Images whose processing raised are reported on stdout and skipped.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist.
    """
    image_dir = Path(image_dir)

    if not image_dir.exists():
        raise FileNotFoundError(f"图片目录不存在: {image_dir}")

    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

    def _numeric_prefix(path):
        # Leading number of names such as "12_title.png"; 0 when absent.
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can parse, so the sort key can no longer raise ValueError.
        prefix = path.stem.split('_')[0]
        return int(prefix) if prefix.isdecimal() else 0

    # A single directory scan with a case-insensitive suffix test. Globbing
    # every pattern in both lower and upper case listed each file twice on
    # platforms where pathlib globbing is case-insensitive (Windows) and
    # missed mixed-case names such as "a.Jpg" elsewhere.
    entries = image_dir.iterdir() if image_dir.is_dir() else ()
    image_files = sorted(
        (
            entry for entry in entries
            if entry.name.lower().endswith(image_suffixes) and entry.is_file()
        ),
        key=lambda path: (_numeric_prefix(path), path.name),
    )

    print(f"[INFO] 找到 {len(image_files)} 张图片")

    # Resolve the output directory and make sure it exists.
    output_dir = image_dir / 'ocr' if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    for i, image_file in enumerate(image_files, 1):
        print(f"\n[{i}/{len(image_files)}] 处理: {image_file.name}")
        try:
            result = detect_and_ocr_comic(image_file, model_path, output_dir)
            results.append(result)
        except Exception as e:
            # Best effort: one failing image must not abort the whole batch.
            print(f"[ERROR] 处理 {image_file.name} 失败: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n[OK] 批量处理完成,成功处理 {len(results)} 张图片")
    return results
if __name__ == '__main__':
    import argparse

    # Command-line entry point: one positional input (image file or
    # directory) plus optional output directory and detector model path.
    arg_parser = argparse.ArgumentParser(description='检测漫画文字区域并用OCR识别')
    arg_parser.add_argument('input', help='输入图片路径或目录')
    arg_parser.add_argument('-o', '--output', help='输出目录')
    arg_parser.add_argument('-m', '--model', help='comic-text-detector模型路径')
    args = arg_parser.parse_args()

    input_path = Path(args.input)

    # Anything that is neither a regular file nor a directory is reported
    # and ends the script with exit status 1.
    if not (input_path.is_file() or input_path.is_dir()):
        print(f"[ERROR] 输入路径不存在: {input_path}")
        sys.exit(1)

    # A directory is processed as a batch, a single file on its own; both
    # routines take the same (path, model, output) arguments.
    if input_path.is_dir():
        run = batch_detect_and_ocr
    else:
        run = detect_and_ocr_comic
    run(input_path, args.model, args.output)
|