detect_and_ocr_comic.py 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415
  1. # -*- coding: utf-8 -*-
  2. """
  3. 使用 comic-text-detector 检测文字区域,然后用 PaddleOCR 识别文字内容
  4. 按日式漫画阅读顺序(从右到左、从上到下)排序
  5. """
  6. import sys
  7. import os
  8. import json
  9. from pathlib import Path
  10. import cv2
  11. import numpy as np
  12. def convert_coordinate_to_math_system(x, y, image_height):
  13. """
  14. 将OpenCV坐标系转换为数学坐标系
  15. 原始坐标系(OpenCV):左上角(0,0),向下为y轴正方向,向右为x轴正方向
  16. 目标坐标系(数学):左下角(0,0),向上为y轴正方向,向右为x轴正方向
  17. 转换公式:
  18. - x_new = x_old (x坐标不变)
  19. - y_new = imageHeight - y_old (y坐标翻转)
  20. 参数:
  21. x: 原始x坐标
  22. y: 原始y坐标
  23. image_height: 图片高度
  24. 返回:
  25. (x_new, y_new): 转换后的坐标
  26. """
  27. x_new = x
  28. y_new = image_height - y
  29. return x_new, y_new
  30. def detect_characters_with_opencv(img, text_bbox, text_content, ocr_bbox_hint=None):
  31. """
  32. 使用OpenCV在文本区域内精确定位每个字符
  33. 参数:
  34. img: 原始图像(BGR格式)
  35. text_bbox: 文本边界框,格式 {'x1': int, 'y1': int, 'x2': int, 'y2': int}
  36. text_content: 文本内容(用于验证字符数量)
  37. ocr_bbox_hint: OCR提供的文本边界框(可选,用于辅助识别)
  38. 返回:
  39. char_boxes: 字符边界框列表,每个元素包含 {'x1', 'y1', 'x2', 'y2', 'center_x', 'center_y'}
  40. 确保字符数量与OCR文本一致,且字符框不重叠
  41. """
  42. # 提取文本区域
  43. x1 = int(text_bbox['x1'])
  44. y1 = int(text_bbox['y1'])
  45. x2 = int(text_bbox['x2'])
  46. y2 = int(text_bbox['y2'])
  47. # 确保坐标在图像范围内
  48. h, w = img.shape[:2]
  49. x1 = max(0, x1)
  50. y1 = max(0, y1)
  51. x2 = min(w, x2)
  52. y2 = min(h, y2)
  53. if x2 <= x1 or y2 <= y1:
  54. return []
  55. # 提取文本区域ROI
  56. text_roi = img[y1:y2, x1:x2].copy()
  57. if text_roi.size == 0:
  58. return []
  59. # 转换为灰度图
  60. if len(text_roi.shape) == 3:
  61. gray_roi = cv2.cvtColor(text_roi, cv2.COLOR_BGR2GRAY)
  62. else:
  63. gray_roi = text_roi
  64. # 二值化处理
  65. # 使用自适应阈值,因为文本区域可能有不同的光照条件
  66. binary = cv2.adaptiveThreshold(
  67. gray_roi, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
  68. cv2.THRESH_BINARY_INV, 11, 2
  69. )
  70. # 形态学操作:去除噪点,连接字符笔画
  71. kernel = np.ones((2, 2), np.uint8)
  72. binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
  73. binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)
  74. # 判断是竖排还是横排
  75. roi_height, roi_width = binary.shape
  76. is_vertical = roi_height > roi_width * 1.2
  77. # 保存布局信息,用于后续估算
  78. layout_info = {'is_vertical': is_vertical, 'roi_width': roi_width, 'roi_height': roi_height}
  79. char_boxes = []
  80. if is_vertical:
  81. # 竖排文字:使用垂直投影来分割字符
  82. # 计算垂直投影(每列的白色像素数量)
  83. vertical_projection = np.sum(binary, axis=0)
  84. # 找到字符之间的空白列(投影值接近0)
  85. threshold = np.max(vertical_projection) * 0.1
  86. char_boundaries = []
  87. in_char = False
  88. start_col = 0
  89. for col in range(len(vertical_projection)):
  90. if vertical_projection[col] > threshold:
  91. if not in_char:
  92. in_char = True
  93. start_col = col
  94. else:
  95. if in_char:
  96. in_char = False
  97. # 字符结束位置(使用中间位置作为分割点)
  98. end_col = col
  99. char_boundaries.append((start_col, end_col))
  100. # 处理最后一个字符
  101. if in_char:
  102. char_boundaries.append((start_col, len(vertical_projection)))
  103. # 为每个字符区域计算水平边界
  104. for start_col, end_col in char_boundaries:
  105. char_col_roi = binary[:, start_col:end_col]
  106. horizontal_projection = np.sum(char_col_roi, axis=1)
  107. # 找到字符的上下边界
  108. char_rows = np.where(horizontal_projection > 0)[0]
  109. if len(char_rows) > 0:
  110. top_row = char_rows[0]
  111. bottom_row = char_rows[-1]
  112. # 转换为原图坐标
  113. char_x1 = x1 + start_col
  114. char_y1 = y1 + top_row
  115. char_x2 = x1 + end_col
  116. char_y2 = y1 + bottom_row
  117. char_boxes.append({
  118. 'x1': float(char_x1),
  119. 'y1': float(char_y1),
  120. 'x2': float(char_x2),
  121. 'y2': float(char_y2),
  122. 'center_x': float((char_x1 + char_x2) / 2),
  123. 'center_y': float((char_y1 + char_y2) / 2)
  124. })
  125. else:
  126. # 横排文字:使用水平投影来分割字符
  127. # 计算水平投影(每行的白色像素数量)
  128. horizontal_projection = np.sum(binary, axis=1)
  129. # 找到字符之间的空白行(投影值接近0)
  130. threshold = np.max(horizontal_projection) * 0.1
  131. char_boundaries = []
  132. in_char = False
  133. start_row = 0
  134. for row in range(len(horizontal_projection)):
  135. if horizontal_projection[row] > threshold:
  136. if not in_char:
  137. in_char = True
  138. start_row = row
  139. else:
  140. if in_char:
  141. in_char = False
  142. # 字符结束位置
  143. end_row = row
  144. char_boundaries.append((start_row, end_row))
  145. # 处理最后一个字符
  146. if in_char:
  147. char_boundaries.append((start_row, len(horizontal_projection)))
  148. # 为每个字符区域计算垂直边界
  149. for start_row, end_row in char_boundaries:
  150. char_row_roi = binary[start_row:end_row, :]
  151. vertical_projection = np.sum(char_row_roi, axis=0)
  152. # 找到字符的左右边界
  153. char_cols = np.where(vertical_projection > 0)[0]
  154. if len(char_cols) > 0:
  155. left_col = char_cols[0]
  156. right_col = char_cols[-1]
  157. # 转换为原图坐标
  158. char_x1 = x1 + left_col
  159. char_y1 = y1 + start_row
  160. char_x2 = x1 + right_col
  161. char_y2 = y1 + end_row
  162. char_boxes.append({
  163. 'x1': float(char_x1),
  164. 'y1': float(char_y1),
  165. 'x2': float(char_x2),
  166. 'y2': float(char_y2),
  167. 'center_x': float((char_x1 + char_x2) / 2),
  168. 'center_y': float((char_y1 + char_y2) / 2)
  169. })
  170. # 如果投影方法检测到的字符数量与文本内容不匹配,使用轮廓检测作为主要方法
  171. text_no_space = text_content.replace(' ', '')
  172. expected_char_count = len(text_no_space)
  173. # 使用轮廓检测作为主要方法(更精确)
  174. contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  175. # 过滤掉太小的轮廓(可能是噪点)
  176. min_area = (roi_width * roi_height) / (expected_char_count * 20) # 更严格的面积阈值
  177. max_area = (roi_width * roi_height) / 2 # 最大面积(避免误检测)
  178. valid_contours = []
  179. for contour in contours:
  180. area = cv2.contourArea(contour)
  181. if min_area < area < max_area:
  182. # 计算轮廓的宽高比,过滤掉明显不是字符的轮廓
  183. x, y, w, h = cv2.boundingRect(contour)
  184. aspect_ratio = h / w if w > 0 else 0
  185. # 字符的宽高比通常在合理范围内
  186. if 0.2 < aspect_ratio < 5.0:
  187. valid_contours.append(contour)
  188. # 如果轮廓数量合理,使用轮廓结果(更精确)
  189. if len(valid_contours) > 0:
  190. # 按位置排序轮廓(从上到下、从右到左)
  191. contour_boxes = []
  192. for contour in valid_contours:
  193. x, y, w, h = cv2.boundingRect(contour)
  194. # 转换为原图坐标
  195. char_x1 = float(x1 + x)
  196. char_y1 = float(y1 + y)
  197. char_x2 = float(x1 + x + w)
  198. char_y2 = float(y1 + y + h)
  199. contour_boxes.append({
  200. 'x1': char_x1,
  201. 'y1': char_y1,
  202. 'x2': char_x2,
  203. 'y2': char_y2,
  204. 'center_x': float(char_x1 + w / 2),
  205. 'center_y': float(char_y1 + h / 2),
  206. 'area': area
  207. })
  208. # 按位置排序(从上到下、从右到左)
  209. contour_boxes.sort(key=lambda b: (b['y1'], -b['center_x']))
  210. # 如果轮廓数量接近预期,使用轮廓结果
  211. if abs(len(contour_boxes) - expected_char_count) <= abs(len(char_boxes) - expected_char_count):
  212. char_boxes = contour_boxes
  213. # 关键改进:确保字符数量与OCR文本一致,且字符框不重叠
  214. # 如果识别出的字符数量不匹配,使用OCR坐标作为参考来辅助识别
  215. if len(char_boxes) != expected_char_count and ocr_bbox_hint:
  216. # 使用OCR提供的边界框作为参考,估算字符位置
  217. char_boxes = refine_char_boxes_with_ocr_hint(
  218. img, text_bbox, text_content, char_boxes, ocr_bbox_hint, expected_char_count
  219. )
  220. # 确保字符框不重叠
  221. char_boxes = remove_overlapping_boxes(char_boxes, expected_char_count)
  222. # 如果字符数量仍然不匹配,使用估算方法
  223. if len(char_boxes) != expected_char_count:
  224. char_boxes = estimate_char_boxes_from_text_bbox(
  225. text_bbox, text_content, expected_char_count, is_vertical
  226. )
  227. return char_boxes
  228. def refine_char_boxes_with_ocr_hint(img, text_bbox, text_content, detected_boxes, ocr_bbox_hint, expected_count):
  229. """
  230. 使用OCR提供的边界框作为参考,改进字符检测
  231. 参数:
  232. img: 原始图像
  233. text_bbox: 文本边界框
  234. text_content: 文本内容
  235. detected_boxes: 已检测到的字符框列表
  236. ocr_bbox_hint: OCR提供的文本边界框
  237. expected_count: 期望的字符数量
  238. 返回:
  239. 改进后的字符框列表
  240. """
  241. # 如果已检测到的字符框数量接近期望值,直接返回
  242. if abs(len(detected_boxes) - expected_count) <= 2:
  243. return detected_boxes
  244. # 使用OCR边界框估算字符位置
  245. text_no_space = text_content.replace(' ', '')
  246. roi_width = text_bbox['x2'] - text_bbox['x1']
  247. roi_height = text_bbox['y2'] - text_bbox['y1']
  248. is_vertical = roi_height > roi_width * 1.2
  249. estimated_boxes = []
  250. if is_vertical:
  251. # 竖排:估算每个字符的位置
  252. # 估算列数和行数
  253. estimated_cols = max(1, int(roi_width / (roi_height / expected_count * 0.8)))
  254. estimated_rows = (expected_count + estimated_cols - 1) // estimated_cols
  255. char_width = roi_width / estimated_cols
  256. char_height = roi_height / estimated_rows
  257. # 如果有已检测到的字符框,使用它们的位置来调整估算
  258. if len(detected_boxes) > 0:
  259. # 使用已检测到的字符框位置来调整估算
  260. for i in range(expected_count):
  261. col = i % estimated_cols
  262. row = i // estimated_cols
  263. est_x = text_bbox['x1'] + col * char_width + char_width / 2
  264. est_y = text_bbox['y1'] + row * char_height + char_height / 2
  265. # 找到最近的已检测字符框
  266. min_dist = float('inf')
  267. best_box = None
  268. for box in detected_boxes:
  269. dist = abs(box['center_x'] - est_x) + abs(box['center_y'] - est_y)
  270. if dist < min_dist:
  271. min_dist = dist
  272. best_box = box
  273. if best_box and min_dist < char_width:
  274. # 使用已检测到的字符框
  275. estimated_boxes.append(best_box)
  276. else:
  277. # 使用估算位置
  278. estimated_boxes.append({
  279. 'x1': float(est_x - char_width / 2),
  280. 'y1': float(est_y - char_height / 2),
  281. 'x2': float(est_x + char_width / 2),
  282. 'y2': float(est_y + char_height / 2),
  283. 'center_x': float(est_x),
  284. 'center_y': float(est_y)
  285. })
  286. else:
  287. # 完全使用估算
  288. for i in range(expected_count):
  289. col = i % estimated_cols
  290. row = i // estimated_cols
  291. est_x = text_bbox['x1'] + col * char_width + char_width / 2
  292. est_y = text_bbox['y1'] + row * char_height + char_height / 2
  293. estimated_boxes.append({
  294. 'x1': float(est_x - char_width / 2),
  295. 'y1': float(est_y - char_height / 2),
  296. 'x2': float(est_x + char_width / 2),
  297. 'y2': float(est_y + char_height / 2),
  298. 'center_x': float(est_x),
  299. 'center_y': float(est_y)
  300. })
  301. else:
  302. # 横排:估算每个字符的位置
  303. char_width = roi_width / expected_count
  304. char_height = roi_height
  305. for i in range(expected_count):
  306. x = text_bbox['x1'] + i * char_width + char_width / 2
  307. y = text_bbox['y1'] + roi_height / 2
  308. estimated_boxes.append({
  309. 'x1': float(x - char_width / 2),
  310. 'y1': float(y - char_height / 2),
  311. 'x2': float(x + char_width / 2),
  312. 'y2': float(y + char_height / 2),
  313. 'center_x': float(x),
  314. 'center_y': float(y)
  315. })
  316. return estimated_boxes[:expected_count]
  317. def remove_overlapping_boxes(char_boxes, expected_count):
  318. """
  319. 移除重叠的字符框,确保字符框不重叠
  320. 参数:
  321. char_boxes: 字符框列表
  322. expected_count: 期望的字符数量
  323. 返回:
  324. 去重后的字符框列表
  325. """
  326. if len(char_boxes) <= expected_count:
  327. return char_boxes
  328. # 按位置排序
  329. sorted_boxes = sorted(char_boxes, key=lambda b: (b['y1'], b['center_x']))
  330. # 移除重叠的字符框
  331. non_overlapping = []
  332. for box in sorted_boxes:
  333. is_overlapping = False
  334. for existing_box in non_overlapping:
  335. # 计算重叠面积
  336. overlap_x1 = max(box['x1'], existing_box['x1'])
  337. overlap_y1 = max(box['y1'], existing_box['y1'])
  338. overlap_x2 = min(box['x2'], existing_box['x2'])
  339. overlap_y2 = min(box['y2'], existing_box['y2'])
  340. if overlap_x2 > overlap_x1 and overlap_y2 > overlap_y1:
  341. overlap_area = (overlap_x2 - overlap_x1) * (overlap_y2 - overlap_y1)
  342. box_area = (box['x2'] - box['x1']) * (box['y2'] - box['y1'])
  343. existing_area = (existing_box['x2'] - existing_box['x1']) * (existing_box['y2'] - existing_box['y1'])
  344. # 如果重叠面积超过较小框的50%,认为是重叠
  345. if overlap_area > min(box_area, existing_area) * 0.5:
  346. is_overlapping = True
  347. break
  348. if not is_overlapping:
  349. non_overlapping.append(box)
  350. # 如果去重后数量不足,尝试合并相近的字符框
  351. if len(non_overlapping) < expected_count:
  352. # 按位置分组,合并相近的字符框
  353. grouped = []
  354. for box in sorted_boxes:
  355. added = False
  356. for group in grouped:
  357. # 检查是否与组内任何框相近
  358. for group_box in group:
  359. dist = abs(box['center_x'] - group_box['center_x']) + abs(box['center_y'] - group_box['center_y'])
  360. if dist < 20: # 如果距离小于20像素,认为是同一个字符
  361. group.append(box)
  362. added = True
  363. break
  364. if added:
  365. break
  366. if not added:
  367. grouped.append([box])
  368. # 对每个组,选择最大的字符框
  369. non_overlapping = []
  370. for group in grouped:
  371. largest = max(group, key=lambda b: (b['x2'] - b['x1']) * (b['y2'] - b['y1']))
  372. non_overlapping.append(largest)
  373. return non_overlapping[:expected_count]
  374. def estimate_char_boxes_from_text_bbox(text_bbox, text_content, expected_count, is_vertical):
  375. """
  376. 从文本边界框估算字符位置(当OpenCV检测失败时使用)
  377. 参数:
  378. text_bbox: 文本边界框
  379. text_content: 文本内容
  380. expected_count: 期望的字符数量
  381. is_vertical: 是否为竖排
  382. 返回:
  383. 估算的字符框列表
  384. """
  385. text_no_space = text_content.replace(' ', '')
  386. roi_width = text_bbox['x2'] - text_bbox['x1']
  387. roi_height = text_bbox['y2'] - text_bbox['y1']
  388. estimated_boxes = []
  389. if is_vertical:
  390. # 竖排:估算每个字符的位置
  391. # 估算列数和行数
  392. estimated_cols = max(1, int(roi_width / (roi_height / expected_count * 0.8)))
  393. estimated_rows = (expected_count + estimated_cols - 1) // estimated_cols
  394. char_width = roi_width / estimated_cols
  395. char_height = roi_height / estimated_rows
  396. for i in range(expected_count):
  397. col = i % estimated_cols
  398. row = i // estimated_cols
  399. x = text_bbox['x1'] + col * char_width + char_width / 2
  400. y = text_bbox['y1'] + row * char_height + char_height / 2
  401. estimated_boxes.append({
  402. 'x1': float(x - char_width / 2),
  403. 'y1': float(y - char_height / 2),
  404. 'x2': float(x + char_width / 2),
  405. 'y2': float(y + char_height / 2),
  406. 'center_x': float(x),
  407. 'center_y': float(y)
  408. })
  409. else:
  410. # 横排:估算每个字符的位置
  411. char_width = roi_width / expected_count
  412. char_height = roi_height
  413. for i in range(expected_count):
  414. x = text_bbox['x1'] + i * char_width + char_width / 2
  415. y = text_bbox['y1'] + roi_height / 2
  416. estimated_boxes.append({
  417. 'x1': float(x - char_width / 2),
  418. 'y1': float(y - char_height / 2),
  419. 'x2': float(x + char_width / 2),
  420. 'y2': float(y + char_height / 2),
  421. 'center_x': float(x),
  422. 'center_y': float(y)
  423. })
  424. return estimated_boxes
  425. # 禁用 oneDNN 以避免 NotImplementedError(PaddlePaddle 3.3.0 的已知问题)
  426. os.environ['FLAGS_onednn'] = '0'
  427. os.environ['FLAGS_use_mkldnn'] = '0'
  428. # Windows编码修复
  429. if sys.platform == 'win32':
  430. import io
  431. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
  432. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
  433. # 添加comic-text-detector路径
  434. project_root = Path(__file__).parent.parent
  435. comic_detector_path = project_root / 'comic-text-detector-master' / 'comic-text-detector-master'
  436. sys.path.insert(0, str(comic_detector_path))
  437. # 添加OnnxOCR本地路径(作为回退选项)
  438. onnxocr_path = project_root / 'OnnxOCR-main' / 'OnnxOCR-main'
  439. if onnxocr_path.exists():
  440. sys.path.insert(0, str(onnxocr_path))
  441. # 处理 wandb 可选依赖(comic-text-detector 需要但推理时不需要)
  442. try:
  443. import wandb
  444. except ImportError:
  445. # 创建一个假的 wandb 模块,避免导入错误
  446. class FakeWandb:
  447. @staticmethod
  448. def init(*args, **kwargs):
  449. return None
  450. @staticmethod
  451. def log(*args, **kwargs):
  452. pass
  453. @staticmethod
  454. def log_model(*args, **kwargs):
  455. pass
  456. sys.modules['wandb'] = FakeWandb()
  457. try:
  458. from inference import TextDetector, REFINEMASK_ANNOTATION
  459. from utils.io_utils import imread, imwrite
  460. except ImportError as e:
  461. print(f"[ERROR] 无法导入comic-text-detector模块: {e}")
  462. print(f"[INFO] 请确保已安装依赖: pip install torch torchvision opencv-python numpy tqdm")
  463. import traceback
  464. traceback.print_exc()
  465. sys.exit(1)
  466. # PaddleOCR(唯一使用)
  467. try:
  468. # 添加PaddleOCR路径
  469. paddleocr_path = project_root / 'PaddleOCR-main' / 'PaddleOCR-main'
  470. if paddleocr_path.exists():
  471. sys.path.insert(0, str(paddleocr_path))
  472. from paddleocr import PaddleOCR
  473. PADDLEOCR_AVAILABLE = True
  474. print("[INFO] PaddleOCR 可用")
  475. except ImportError as e:
  476. print(f"[ERROR] 无法导入PaddleOCR模块: {e}")
  477. print("[ERROR] PaddleOCR 是必需的,请确保已正确安装")
  478. PADDLEOCR_AVAILABLE = False
  479. # 格子识别代码已移动到 python/generate-anim/detect_panels.py
  480. # 通过导入使用
  481. try:
  482. # 添加当前目录到路径,以便导入同目录下的模块
  483. import sys
  484. current_dir = Path(__file__).parent
  485. if str(current_dir) not in sys.path:
  486. sys.path.insert(0, str(current_dir))
  487. from detect_panels import detect_comic_panels, merge_panel_mask_with_text_mask
  488. except ImportError as e:
  489. print(f"[WARN] 无法导入detect_panels模块,使用本地实现: {e}")
  490. # 如果导入失败,使用本地实现(向后兼容)
  491. def detect_comic_panels(img):
  492. """使用opencv检测漫画格子(分镜框)- 本地实现"""
  493. if len(img.shape) == 3:
  494. gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  495. else:
  496. gray = img.copy()
  497. panel_mask = np.zeros_like(gray)
  498. edges = cv2.Canny(gray, 50, 150, apertureSize=3)
  499. horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
  500. horizontal_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, horizontal_kernel)
  501. vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
  502. vertical_lines = cv2.morphologyEx(edges, cv2.MORPH_OPEN, vertical_kernel)
  503. lines_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)
  504. lines = cv2.HoughLinesP(lines_mask, 1, np.pi/180, threshold=100,
  505. minLineLength=50, maxLineGap=10)
  506. if lines is not None:
  507. for line in lines:
  508. x1, y1, x2, y2 = line[0]
  509. cv2.line(panel_mask, (x1, y1), (x2, y2), 255, 2)
  510. kernel = np.ones((3, 3), np.uint8)
  511. dilated = cv2.dilate(lines_mask, kernel, iterations=2)
  512. contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  513. panels = []
  514. for contour in contours:
  515. x, y, w, h = cv2.boundingRect(contour)
  516. area = w * h
  517. if area > img.shape[0] * img.shape[1] * 0.01:
  518. cv2.rectangle(panel_mask, (x, y), (x + w, y + h), 255, 2)
  519. panels.append({
  520. 'x': x,
  521. 'y': y,
  522. 'width': w,
  523. 'height': h,
  524. 'center_x': x + w / 2,
  525. 'center_y': y + h / 2
  526. })
  527. return panel_mask, panels
  528. def merge_panel_mask_with_text_mask(panel_mask, text_mask):
  529. """合并格子遮罩图和文字mask图"""
  530. if panel_mask.shape != text_mask.shape:
  531. panel_mask = cv2.resize(panel_mask, (text_mask.shape[1], text_mask.shape[0]))
  532. return np.maximum(panel_mask, text_mask)
  533. def get_text_block_panel(text_block, panels):
  534. """
  535. 判断文字块属于哪个格子
  536. 参数:
  537. text_block: 文字块,包含bbox信息
  538. panels: 格子列表
  539. 返回:
  540. panel_index: 格子索引,如果不在任何格子内返回-1
  541. """
  542. bbox = text_block['bbox']
  543. center_x = (bbox['x1'] + bbox['x2']) / 2
  544. center_y = (bbox['y1'] + bbox['y2']) / 2
  545. for i, panel in enumerate(panels):
  546. if (panel['x'] <= center_x <= panel['x'] + panel['width'] and
  547. panel['y'] <= center_y <= panel['y'] + panel['height']):
  548. return i
  549. return -1
  550. def sort_text_blocks_by_panels(text_blocks, panels, image_width, image_height):
  551. """
  552. 按日式漫画阅读顺序排序:从右到左、从上到下(竖着读取)
  553. 排序规则:
  554. 1. 先按列分组(从右到左)- 越往右的列越靠前
  555. 2. 同一列内,按行排序(从上到下)- 越往上的行越靠前
  556. 3. 同一格子内,按X坐标从右到左
  557. 参数:
  558. text_blocks: 文字块列表,每个包含bbox信息
  559. panels: 格子列表
  560. image_width: 图片宽度
  561. image_height: 图片高度
  562. 返回:
  563. 排序后的文字块列表
  564. """
  565. if not text_blocks:
  566. return []
  567. # 计算每个文字块的中心点和所属格子
  568. for block in text_blocks:
  569. bbox = block['bbox']
  570. block['center_x'] = (bbox['x1'] + bbox['x2']) / 2
  571. block['center_y'] = (bbox['y1'] + bbox['y2']) / 2
  572. block['panel_index'] = get_text_block_panel(block, panels)
  573. # 排序规则(日式漫画:从右到左、从上到下竖着读取):
  574. # 1. 先按X坐标分组(从右到左)- X坐标越大(越靠右)越靠前
  575. # 2. 同一列内,按Y坐标排序(从上到下)- Y坐标越小(越往上)越靠前
  576. # 3. 同一位置,按X坐标从右到左
  577. # 将图片分成列(从右到左)
  578. # 使用图片宽度的20%作为列的分组阈值(更宽松的分组)
  579. column_threshold = max(image_width * 0.2, 100) # 至少100像素
  580. def sort_key(block):
  581. # 直接使用文字块的中心坐标,不依赖格子
  582. center_x = block['center_x']
  583. center_y = block['center_y']
  584. # 计算列号(从右到左,列号越小越靠右)
  585. # 将X坐标转换为列号:X坐标越大,列号越小(越靠右)
  586. # 使用 image_width - center_x 来计算距离右边的距离
  587. distance_from_right = image_width - center_x
  588. column = int(distance_from_right / column_threshold)
  589. # 使用列号和Y坐标作为主要排序依据
  590. # 列号越小(越靠右)越靠前,Y坐标越小(越往上)越靠前
  591. # 同一列同一行内,X坐标越大(越靠右)越靠前
  592. return (column, center_y, -center_x)
  593. sorted_blocks = sorted(text_blocks, key=sort_key)
  594. return sorted_blocks
  595. def detect_and_ocr_comic(image_path, model_path=None, output_dir=None):
  596. """
  597. 检测漫画文字区域并用OCR识别
  598. 参数:
  599. image_path: 图片路径
  600. model_path: comic-text-detector模型路径
  601. output_dir: 输出目录
  602. """
  603. image_path = Path(image_path)
  604. if not image_path.exists():
  605. raise FileNotFoundError(f"图片文件不存在: {image_path}")
  606. print(f"📖 正在处理图片: {image_path.name}")
  607. # 设置模型路径
  608. if model_path is None:
  609. possible_paths = [
  610. comic_detector_path / 'data' / 'comictextdetector.pt',
  611. comic_detector_path / 'data' / 'comictextdetector.pt.onnx',
  612. ]
  613. model_path = None
  614. for path in possible_paths:
  615. if path.exists():
  616. model_path = path
  617. break
  618. if model_path is None:
  619. raise FileNotFoundError(
  620. f"未找到comic-text-detector模型文件。请下载模型并放到以下位置之一:\n" +
  621. "\n".join([f" - {p}" for p in possible_paths])
  622. )
  623. # 设置输出目录
  624. if output_dir is None:
  625. output_dir = image_path.parent
  626. else:
  627. output_dir = Path(output_dir)
  628. output_dir.mkdir(parents=True, exist_ok=True)
  629. # 创建tmp子目录用于保存中间处理文件
  630. tmp_dir = output_dir / 'tmp'
  631. tmp_dir.mkdir(parents=True, exist_ok=True)
  632. # 初始化comic-text-detector
  633. device = 'cuda' if __import__('torch').cuda.is_available() else 'cpu'
  634. print(f"[INFO] 使用设备: {device}")
  635. try:
  636. detector = TextDetector(
  637. model_path=str(model_path),
  638. input_size=1024,
  639. device=device,
  640. act='leaky'
  641. )
  642. except Exception as e:
  643. print(f"[ERROR] 初始化检测器失败: {e}")
  644. raise
  645. # 初始化PaddleOCR(唯一使用)
  646. print("[INFO] 初始化PaddleOCR...")
  647. ocr_engine = None
  648. paddleocr_instance = None
  649. if not PADDLEOCR_AVAILABLE:
  650. raise RuntimeError("PaddleOCR 不可用,请确保已正确安装 paddlex[ocr-core]")
  651. try:
  652. # 初始化PaddleOCR,使用中文模型
  653. # enable_mkldnn=False 禁用 MKL-DNN 以避免 NotImplementedError
  654. # use_angle_cls=True 启用角度分类器,可以更好地识别竖排文字
  655. paddleocr_instance = PaddleOCR(
  656. use_angle_cls=True, # 启用角度分类器,支持竖排文字识别
  657. lang='ch', # 中文
  658. enable_mkldnn=False # 禁用 MKL-DNN 以避免 oneDNN 错误
  659. )
  660. ocr_engine = 'paddleocr'
  661. print("[INFO] PaddleOCR 初始化成功")
  662. except Exception as e:
  663. print(f"[ERROR] PaddleOCR初始化失败: {e}")
  664. raise RuntimeError(f"PaddleOCR 初始化失败: {e}")
  665. # 读取图片
  666. img = imread(str(image_path))
  667. if img is None:
  668. raise ValueError(f"无法读取图片文件: {image_path}")
  669. im_h, im_w = img.shape[:2]
  670. print(f"[INFO] 图片尺寸: {im_w}x{im_h}")
  671. image_name = image_path.stem
  672. # 步骤1: 使用comic-text-detector检测文字区域(先检测文字块,用于辅助格子检测)
  673. print("[INFO] 步骤1: 检测文字区域...")
  674. try:
  675. mask, mask_refined, blk_list = detector(
  676. img,
  677. refine_mode=REFINEMASK_ANNOTATION,
  678. keep_undetected_mask=True
  679. )
  680. except Exception as e:
  681. print(f"[ERROR] 检测失败: {e}")
  682. raise
  683. print(f"[OK] 检测到 {len(blk_list)} 个文字区域")
  684. # 步骤2: 使用文字遮罩图和文字块信息辅助检测漫画格子
  685. print("[INFO] 步骤2: 检测漫画格子(使用文字遮罩图和文字块信息辅助)...")
  686. # 将文字块转换为统一格式
  687. text_blocks = []
  688. for blk in blk_list:
  689. x1, y1, x2, y2 = blk.xyxy
  690. text_blocks.append({
  691. 'xyxy': [int(x1), int(y1), int(x2), int(y2)]
  692. })
  693. # 使用文字遮罩图和文字块信息检测格子(优先使用文字遮罩图)
  694. panel_mask, panels = detect_comic_panels(img, text_blocks=text_blocks, text_mask=mask_refined)
  695. print(f"[OK] 检测到 {len(panels)} 个格子")
  696. # 如果检测到的格子太少,尝试不使用辅助信息重新检测
  697. if len(panels) < 4:
  698. print(f"[WARN] 检测到的格子数量较少({len(panels)}个),尝试使用传统方法重新检测...")
  699. panel_mask_fallback, panels_fallback = detect_comic_panels(img, text_blocks=None, text_mask=None)
  700. if len(panels_fallback) > len(panels):
  701. panel_mask = panel_mask_fallback
  702. panels = panels_fallback
  703. print(f"[OK] 使用传统方法检测到 {len(panels)} 个格子")
  704. # 保存格子遮罩图到tmp目录(中间文件)
  705. panel_mask_path = tmp_dir / f"{image_name}_panel_mask.png"
  706. imwrite(str(panel_mask_path), panel_mask)
  707. print(f"[OK] 已保存格子遮罩图: {panel_mask_path}")
  708. # 保存格子信息JSON到tmp目录(中间文件)
  709. panels_json = {
  710. 'image_file': image_path.name,
  711. 'panels': panels,
  712. 'total_count': len(panels)
  713. }
  714. panels_json_path = tmp_dir / f"{image_name}_panels.json"
  715. with open(panels_json_path, 'w', encoding='utf-8') as f:
  716. json.dump(panels_json, f, ensure_ascii=False, indent=2)
  717. print(f"[OK] 已保存格子信息: {panels_json_path}")
  718. # 保存原始文字遮罩图到tmp目录(中间文件)
  719. text_mask_path = tmp_dir / f"{image_name}_text_mask.png"
  720. imwrite(str(text_mask_path), mask_refined)
  721. print(f"[OK] 已保存文字遮罩图: {text_mask_path}")
  722. # 步骤3: 合并格子遮罩图和文字mask图
  723. print("[INFO] 步骤3: 合并格子遮罩图和文字mask图...")
  724. combined_mask = merge_panel_mask_with_text_mask(panel_mask, mask_refined)
  725. # 保存合并后的mask图片到tmp目录(中间文件)
  726. combined_mask_path = tmp_dir / f"{image_name}_combined_mask.png"
  727. print(f"[INFO] 步骤4: 保存合并后的mask图片到磁盘...")
  728. imwrite(str(combined_mask_path), combined_mask)
  729. print(f"[OK] 已保存合并后的mask图片: {combined_mask_path}")
  730. # 确认文件已生成
  731. if not text_mask_path.exists():
  732. raise FileNotFoundError(f"文字遮罩图文件未成功生成: {text_mask_path}")
  733. print(f"[OK] 已确认文字遮罩图文件存在")
  734. # 步骤5: 从保存的mask文件中读取,裁剪每个文字区域,然后识别
  735. print(f"[INFO] 步骤5: 从mask文件中读取并识别 {len(blk_list)} 个文字区域...")
  736. # 使用合并后的mask(已经在内存中,不需要重新读取)
  737. mask_img = combined_mask
  738. dialogues = []
  739. for i, blk in enumerate(blk_list):
  740. x1, y1, x2, y2 = blk.xyxy
  741. # 确保坐标在图片范围内
  742. x1 = max(0, int(x1))
  743. y1 = max(0, int(y1))
  744. x2 = min(im_w, int(x2))
  745. y2 = min(im_h, int(y2))
  746. # 从mask图片中裁剪对应的文字区域
  747. crop_mask = mask_img[y1:y2, x1:x2]
  748. if crop_mask.size == 0:
  749. continue
  750. # 同时从原图中裁剪对应的文字区域(用于OCR识别,效果更好)
  751. crop_img = img[y1:y2, x1:x2]
  752. # 确保是RGB格式(Tesseract可以直接使用,但统一使用RGB格式)
  753. if len(crop_img.shape) == 2:
  754. # 如果是灰度图,转换为RGB
  755. crop_img = cv2.cvtColor(crop_img, cv2.COLOR_GRAY2RGB)
  756. elif len(crop_img.shape) == 3 and crop_img.shape[2] == 4:
  757. # 如果是RGBA,转换为RGB
  758. crop_img = cv2.cvtColor(crop_img, cv2.COLOR_RGBA2RGB)
  759. # 对图片进行预处理以提高OCR识别率(保守处理,避免过度处理)
  760. # 1. 转换为灰度图
  761. if len(crop_img.shape) == 3:
  762. gray = cv2.cvtColor(crop_img, cv2.COLOR_RGB2GRAY)
  763. else:
  764. gray = crop_img
  765. # 2. 检测是否为黑底白字(黑白漫画)
  766. # 计算图片的平均亮度
  767. mean_brightness = np.mean(gray)
  768. is_dark_background = mean_brightness < 127 # 如果平均亮度小于127,可能是黑底
  769. # 如果是黑底白字,先反转颜色(OCR模型通常训练在白底黑字上)
  770. if is_dark_background:
  771. gray = cv2.bitwise_not(gray)
  772. # 3. 适度放大图片(仅对很小的文字区域)
  773. h, w = gray.shape[:2]
  774. if h < 32 or w < 32: # 只有很小的文字区域才放大
  775. scale = 2.0
  776. new_h, new_w = int(h * scale), int(w * scale)
  777. gray = cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
  778. # 4. 增强对比度(使用CLAHE,保守设置)
  779. clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
  780. enhanced = clahe.apply(gray)
  781. # 5. 轻度去噪处理(避免过度模糊)
  782. enhanced = cv2.fastNlMeansDenoising(enhanced, h=8, templateWindowSize=7, searchWindowSize=21)
  783. # 6. 转换回RGB格式(Tesseract可以直接使用灰度图,但RGB也可以)
  784. if len(enhanced.shape) == 2:
  785. crop_img_processed = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)
  786. else:
  787. crop_img_processed = enhanced
  788. text_block = {
  789. 'index': i + 1,
  790. 'bbox': {
  791. 'x1': x1,
  792. 'y1': y1,
  793. 'x2': x2,
  794. 'y2': y2,
  795. 'width': x2 - x1,
  796. 'height': y2 - y1,
  797. 'center_x': (x1 + x2) / 2,
  798. 'center_y': (y1 + y2) / 2
  799. }
  800. }
  801. try:
  802. if ocr_engine == 'paddleocr':
  803. # 使用PaddleOCR识别
  804. try:
  805. # PaddleOCR返回格式: [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], (text, confidence), ...]
  806. ocr_result = paddleocr_instance.ocr(crop_img_processed)
  807. # 调试:打印OCR结果格式
  808. if ocr_result:
  809. print(f" [DEBUG] 第 {i+1} 个区域: ocr_result类型={type(ocr_result)}, 长度={len(ocr_result) if isinstance(ocr_result, (list, tuple)) else 'N/A'}")
  810. if len(ocr_result) > 0:
  811. result_item = ocr_result[0]
  812. print(f" [DEBUG] 第 {i+1} 个区域: ocr_result[0]类型={type(result_item)}")
  813. # 检查OCRResult对象的属性
  814. if hasattr(result_item, '__dict__'):
  815. print(f" [DEBUG] 第 {i+1} 个区域: OCRResult属性={list(result_item.__dict__.keys())}")
  816. # 尝试转换为列表或字典
  817. try:
  818. if hasattr(result_item, 'text_lines') or hasattr(result_item, 'texts'):
  819. print(f" [DEBUG] 第 {i+1} 个区域: 尝试访问text_lines或texts属性")
  820. except:
  821. pass
  822. if ocr_result and len(ocr_result) > 0:
  823. # PaddleOCR 3.x 返回的是 OCRResult 对象
  824. result_item = ocr_result[0]
  825. # OCRResult 对象有 json 属性,返回字典格式
  826. # 结构: {'res': {'rec_texts': [...], 'rec_scores': [...], 'rec_polys': [...], 'rec_boxes': [...]}}
  827. try:
  828. result_json = result_item.json
  829. res_data = result_json.get('res', {}) if isinstance(result_json, dict) else {}
  830. # 提取文本、置信度、坐标
  831. rec_texts = res_data.get('rec_texts', [])
  832. rec_scores = res_data.get('rec_scores', [])
  833. rec_polys = res_data.get('rec_polys', []) # 多边形坐标 [[[x1,y1],[x2,y2],[x3,y3],[x4,y4]], ...]
  834. rec_boxes = res_data.get('rec_boxes', []) # 边界框 [[x1,y1,x2,y2], ...]
  835. if not rec_texts:
  836. print(f" [DEBUG] 第 {i+1} 个区域: PaddleOCR未识别到文字")
  837. continue
  838. text_lines_with_bbox = []
  839. all_texts = []
  840. all_char_boxes_list = []
  841. # 关键改进:先收集所有文本行和它们的边界框,然后对整个文本区域进行字符检测
  842. # 这样可以确保OpenCV检测到所有字符,而不仅仅是单个文本行的字符
  843. # 解析PaddleOCR结果,收集所有文本行
  844. all_text_lines = [] # 存储所有文本行及其边界框
  845. for idx, text in enumerate(rec_texts):
  846. if not text or not text.strip():
  847. continue
  848. # 获取置信度
  849. confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 0.9
  850. # 获取坐标(优先使用多边形坐标,如果没有则使用边界框)
  851. if idx < len(rec_polys) and rec_polys[idx]:
  852. bbox_coords = rec_polys[idx] # [[x1,y1],[x2,y2],[x3,y3],[x4,y4]]
  853. elif idx < len(rec_boxes) and rec_boxes[idx]:
  854. # 将边界框转换为多边形格式
  855. box = rec_boxes[idx] # [x1, y1, x2, y2]
  856. bbox_coords = [
  857. [box[0], box[1]], # 左上
  858. [box[2], box[1]], # 右上
  859. [box[2], box[3]], # 右下
  860. [box[0], box[3]] # 左下
  861. ]
  862. else:
  863. print(f" [DEBUG] 第 {i+1} 个区域: 第 {idx} 个文本没有坐标信息")
  864. continue
  865. if text and text.strip():
  866. # 计算边界框
  867. # 检查 bbox_coords 格式
  868. if not isinstance(bbox_coords, (list, tuple)) or len(bbox_coords) < 4:
  869. print(f" [DEBUG] 第 {i+1} 个区域: bbox_coords 格式不正确: {type(bbox_coords)}, {bbox_coords}")
  870. continue
  871. # 检查每个坐标点格式
  872. try:
  873. x_coords = []
  874. y_coords = []
  875. for coord in bbox_coords:
  876. if isinstance(coord, (list, tuple)) and len(coord) >= 2:
  877. x_coords.append(coord[0])
  878. y_coords.append(coord[1])
  879. else:
  880. print(f" [DEBUG] 第 {i+1} 个区域: 坐标点格式不正确: {coord}")
  881. break
  882. if not x_coords or not y_coords or len(x_coords) < 4:
  883. print(f" [DEBUG] 第 {i+1} 个区域: 无法提取足够的坐标点")
  884. continue
  885. except (TypeError, IndexError) as e:
  886. print(f" [DEBUG] 第 {i+1} 个区域: 解析坐标失败: {e}, bbox_coords={bbox_coords}")
  887. continue
  888. left = min(x_coords)
  889. top = min(y_coords)
  890. right = max(x_coords)
  891. bottom = max(y_coords)
  892. # 转换为绝对坐标(相对于原图)
  893. char_bbox = {
  894. 'x1': float(x1 + left),
  895. 'y1': float(y1 + top),
  896. 'x2': float(x1 + right),
  897. 'y2': float(y1 + bottom),
  898. 'center_x': float(x1 + (left + right) / 2),
  899. 'center_y': float(y1 + (top + bottom) / 2)
  900. }
  901. text_lines_with_bbox.append({
  902. 'text': text,
  903. 'bbox': char_bbox,
  904. 'confidence': confidence
  905. })
  906. all_texts.append((text, confidence))
  907. # 收集文本行信息,稍后统一处理
  908. all_text_lines.append({
  909. 'text': text,
  910. 'bbox': char_bbox,
  911. 'confidence': confidence
  912. })
  913. # 关键改进:对所有文本行合并后的整个区域进行字符检测
  914. if all_text_lines:
  915. # 计算整个文本区域的边界框(包含所有文本行)
  916. all_x1 = [line['bbox']['x1'] for line in all_text_lines]
  917. all_y1 = [line['bbox']['y1'] for line in all_text_lines]
  918. all_x2 = [line['bbox']['x2'] for line in all_text_lines]
  919. all_y2 = [line['bbox']['y2'] for line in all_text_lines]
  920. combined_bbox = {
  921. 'x1': float(min(all_x1)),
  922. 'y1': float(min(all_y1)),
  923. 'x2': float(max(all_x2)),
  924. 'y2': float(max(all_y2)),
  925. 'center_x': float((min(all_x1) + max(all_x2)) / 2),
  926. 'center_y': float((min(all_y1) + max(all_y2)) / 2)
  927. }
  928. # 合并所有文本行的文本
  929. combined_text_for_detection = ''.join([line['text'] for line in all_text_lines])
  930. # 使用OpenCV检测整个文本区域的所有字符
  931. # 注意:text_bbox_for_detection必须使用绝对坐标(相对于原图)
  932. # 因为detect_characters_with_opencv函数期望的是原图坐标
  933. text_bbox_for_detection = {
  934. 'x1': combined_bbox['x1'],
  935. 'y1': combined_bbox['y1'],
  936. 'x2': combined_bbox['x2'],
  937. 'y2': combined_bbox['y2']
  938. }
  939. # 使用OpenCV检测字符位置(需要传入原图img,而不是crop_img)
  940. # 注意:坐标是相对于原图的,所以需要传入原图
  941. # 传入OCR的边界框作为参考,提高识别率
  942. detected_char_boxes = detect_characters_with_opencv(
  943. img, text_bbox_for_detection, combined_text_for_detection, ocr_bbox_hint=combined_bbox
  944. )
  945. # 调试输出:检查OpenCV是否识别出所有字符
  946. if '远道' in combined_text_for_detection or '石田' in combined_text_for_detection:
  947. print(f" [DEBUG] 合并后OCR文本: {combined_text_for_detection}")
  948. text_no_space_debug = combined_text_for_detection.replace(' ', '')
  949. print(f" [DEBUG] 去除空格后: {text_no_space_debug}, 字符数: {len(text_no_space_debug)}")
  950. print(f" [DEBUG] OpenCV检测到的字符框数: {len(detected_char_boxes)}")
  951. if len(detected_char_boxes) > 0:
  952. print(f" [DEBUG] 前3个字符框位置: center_x={[b['center_x'] for b in detected_char_boxes[:3]]}, center_y={[b['center_y'] for b in detected_char_boxes[:3]]}")
  953. text_no_space = combined_text_for_detection.replace(' ', '')
  954. if len(detected_char_boxes) > 0 and len(detected_char_boxes) == len(text_no_space):
  955. # 使用OpenCV检测到的精确位置
  956. # 关键:OpenCV检测的字符框顺序可能与OCR文本顺序不一致
  957. # 需要根据字符框的位置来匹配字符,而不是简单地按索引对应
  958. # 方法1:将字符框按位置排序(在OpenCV坐标系中:从上到下、从右到左)
  959. # 注意:detect_characters_with_opencv函数返回的字符框可能已经按某种顺序排列
  960. # 但我们需要确保按照正确的阅读顺序(从上到下、从右到左)排序
  961. sorted_char_boxes = sorted(detected_char_boxes, key=lambda b: (b['y1'], -b['center_x']))
  962. # 反转文本字符,使其与字符框的位置顺序对应
  963. reversed_text_chars = list(text_no_space[::-1])
  964. # 将排序后的字符框与反转后的文本字符对应
  965. for k, char_box in enumerate(sorted_char_boxes):
  966. char = reversed_text_chars[k] if k < len(reversed_text_chars) else '?'
  967. all_char_boxes_list.append({
  968. 'char': char,
  969. 'x1': char_box['x1'],
  970. 'y1': char_box['y1'],
  971. 'x2': char_box['x2'],
  972. 'y2': char_box['y2'],
  973. 'center_x': char_box['center_x'],
  974. 'center_y': char_box['center_y']
  975. })
  976. else:
  977. # 如果OpenCV检测失败,回退到估算方法
  978. if len(text_no_space) > 0:
  979. bbox_width = right - left
  980. bbox_height = bottom - top
  981. is_vertical = bbox_height > bbox_width * 1.2
  982. if is_vertical:
  983. # 竖排:字符从上到下(y坐标从小到大)
  984. char_height = bbox_height / len(text_no_space)
  985. for k, char in enumerate(text_no_space):
  986. char_x = char_bbox['center_x']
  987. char_y = char_bbox['y1'] + char_height * (k + 0.5)
  988. all_char_boxes_list.append({
  989. 'char': char,
  990. 'x1': char_x - 5,
  991. 'y1': char_y - char_height/2,
  992. 'x2': char_x + 5,
  993. 'y2': char_y + char_height/2,
  994. 'center_x': char_x,
  995. 'center_y': char_y
  996. })
  997. else:
  998. # 横排:字符从左到右(估算)
  999. char_width = bbox_width / len(text_no_space)
  1000. for k, char in enumerate(text_no_space):
  1001. char_x = char_bbox['x1'] + char_width * (k + 0.5)
  1002. char_y = char_bbox['center_y']
  1003. all_char_boxes_list.append({
  1004. 'char': char,
  1005. 'x1': char_x - char_width/2,
  1006. 'y1': char_y - 5,
  1007. 'x2': char_x + char_width/2,
  1008. 'y2': char_y + 5,
  1009. 'center_x': char_x,
  1010. 'center_y': char_y
  1011. })
  1012. # 合并所有文字
  1013. if all_texts:
  1014. # 先对文字行进行排序(从右到左、从上到下)
  1015. # 注意:对于日式漫画,阅读顺序是从右到左、从上到下
  1016. # 排序规则:先按Y坐标从上到下(y1越小越靠上),然后按X坐标从右到左(center_x越大越靠右)
  1017. if len(text_lines_with_bbox) > 1:
  1018. text_lines_with_bbox.sort(key=lambda line: (line['bbox']['y1'], -line['bbox']['center_x']))
  1019. # 从排序后的text_lines_with_bbox中提取文本
  1020. text_lines = [line['text'] for line in text_lines_with_bbox]
  1021. combined_text = ' '.join(text_lines)
  1022. avg_confidence = sum([t[1] for t in all_texts]) / len(all_texts) if all_texts else 0.0
  1023. # 使用字符位置信息
  1024. character_positions = []
  1025. if all_char_boxes_list and len(all_char_boxes_list) > 0:
  1026. # 获取图片高度(用于坐标转换)
  1027. img_height = img.shape[0]
  1028. # 注意:字符框已经在前面按位置排序并与文本字符对应了
  1029. # 这里不需要再次排序,保持字符与坐标的对应关系
  1030. # 直接使用all_char_boxes_list,保持字符与坐标的对应关系
  1031. for char_box in all_char_boxes_list:
  1032. # 将坐标转换为数学坐标系(左下角为原点,向上为y轴正方向)
  1033. # 转换中心坐标
  1034. center_x_old = char_box['center_x']
  1035. center_y_old = char_box['center_y']
  1036. center_x_new, center_y_new = convert_coordinate_to_math_system(center_x_old, center_y_old, img_height)
  1037. # 转换边界框坐标(用于更精确的位置信息)
  1038. x1_old = char_box['x1']
  1039. y1_old = char_box['y1']
  1040. x2_old = char_box['x2']
  1041. y2_old = char_box['y2']
  1042. x1_new, y1_new = convert_coordinate_to_math_system(x1_old, y1_old, img_height)
  1043. x2_new, y2_new = convert_coordinate_to_math_system(x2_old, y2_old, img_height)
  1044. # 注意:在数学坐标系中,y1_new > y2_new(因为y1在原图中更靠上,转换后y值更大)
  1045. # 所以需要确保y1是上边界(y值更大),y2是下边界(y值更小)
  1046. y1_math = max(y1_new, y2_new) # 上边界(y值更大)
  1047. y2_math = min(y1_new, y2_new) # 下边界(y值更小)
  1048. character_positions.append({
  1049. 'x': center_x_new, # 转换后的中心x坐标(数学坐标系)
  1050. 'y': center_y_new, # 转换后的中心y坐标(数学坐标系)
  1051. 'center_x': center_x_new, # 转换后的中心x坐标
  1052. 'center_y': center_y_new, # 转换后的中心y坐标
  1053. 'x1': min(x1_new, x2_new), # 转换后的左边界x坐标
  1054. 'y1': y1_math, # 转换后的上边界y坐标(数学坐标系中y值更大)
  1055. 'x2': max(x1_new, x2_new), # 转换后的右边界x坐标
  1056. 'y2': y2_math, # 转换后的下边界y坐标(数学坐标系中y值更小)
  1057. 'x_old': center_x_old, # 保留原始中心x坐标(用于调试)
  1058. 'y_old': center_y_old # 保留原始中心y坐标(用于调试)
  1059. })
  1060. # 如果字符位置数量不匹配,清空
  1061. text_no_space_for_check = combined_text.replace(' ', '')
  1062. if len(character_positions) != len(text_no_space_for_check):
  1063. if '远道' in combined_text or '石田' in combined_text:
  1064. print(f" [DEBUG] 字符位置数量不匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}, text=\"{combined_text}\"")
  1065. character_positions = []
  1066. elif '远道' in combined_text or '石田' in combined_text:
  1067. print(f" [DEBUG] 字符位置数量匹配: character_positions={len(character_positions)}, text长度={len(text_no_space_for_check)}")
  1068. print(f" [DEBUG] 前3个character_positions: {[{'x': p.get('center_x', p.get('x', 0)), 'y': p.get('center_y', p.get('y', 0))} for p in character_positions[:3]]}")
  1069. # 调试输出:检查character_positions
  1070. if ('远道' in combined_text or '石田' in combined_text) and character_positions:
  1071. print(f" [DEBUG] 保存到dialogues: text=\"{combined_text}\", character_positions数量={len(character_positions)}")
  1072. if combined_text and combined_text.strip():
  1073. dialogues.append({
  1074. 'order': i + 1,
  1075. 'text': combined_text,
  1076. 'bbox': text_block['bbox'],
  1077. 'confidence': avg_confidence,
  1078. 'character_positions': character_positions if character_positions else None
  1079. })
  1080. text_preview = combined_text[:30] + '...' if len(combined_text) > 30 else combined_text
  1081. print(f" [{i+1}/{len(blk_list)}] 识别: {text_preview} (置信度: {avg_confidence:.2f})")
  1082. else:
  1083. print(f" [DEBUG] 第 {i+1} 个区域: combined_text为空 (all_texts长度: {len(all_texts)})")
  1084. else:
  1085. print(f" [DEBUG] 第 {i+1} 个区域未识别到文字 (all_texts为空)")
  1086. except Exception as e:
  1087. print(f" [WARN] PaddleOCR解析第 {i+1} 个区域结果失败: {e}")
  1088. import traceback
  1089. traceback.print_exc()
  1090. continue
  1091. except Exception as e:
  1092. print(f" [WARN] PaddleOCR识别第 {i+1} 个区域失败: {e}")
  1093. import traceback
  1094. traceback.print_exc()
  1095. continue
  1096. else:
  1097. # 只使用PaddleOCR,如果失败则报错
  1098. raise RuntimeError(f"OCR引擎不是PaddleOCR,当前引擎: {ocr_engine}")
  1099. except Exception as e:
  1100. print(f" [WARN] 识别第 {i+1} 个区域失败: {e}")
  1101. import traceback
  1102. traceback.print_exc()
  1103. continue
  1104. print(f"[OK] 成功识别 {len(dialogues)} 段文字")
  1105. # 步骤6: 按格子位置排序(越往上、越往右的格子里的对话顺序越靠前)
  1106. print("[INFO] 步骤6: 按格子位置排序...")
  1107. sorted_dialogues = sort_text_blocks_by_panels(dialogues, panels, im_w, im_h)
  1108. # 重新分配order,保留order、text、bbox和character_positions字段
  1109. formatted_dialogues = []
  1110. for i, dialogue in enumerate(sorted_dialogues, 1):
  1111. formatted_dialogues.append({
  1112. 'order': i,
  1113. 'text': dialogue['text'],
  1114. 'bbox': dialogue.get('bbox', {}), # 保留bbox信息用于排序
  1115. 'character_positions': dialogue.get('character_positions') # 保留字符位置信息用于字符排序
  1116. })
  1117. # 步骤7: 保存JSON结果到output_dir(ocr目录,最终结果)
  1118. print("[INFO] 步骤7: 保存JSON结果...")
  1119. result = {
  1120. 'image_file': image_path.name,
  1121. 'reading_order': '从右到左、从上到下(日式漫画阅读顺序)',
  1122. 'dialogues': formatted_dialogues,
  1123. 'total_count': len(formatted_dialogues)
  1124. }
  1125. # 保存JSON到output_dir(ocr目录,最终结果文件)
  1126. json_path = output_dir / f"{image_name}_dialogues.json"
  1127. with open(json_path, 'w', encoding='utf-8') as f:
  1128. json.dump(result, f, ensure_ascii=False, indent=2)
  1129. print(f"[OK] 已保存对白结果: {json_path}")
  1130. return result
  def batch_detect_and_ocr(image_dir, model_path=None, output_dir=None):
      """Run detection + OCR on every image located directly in a directory.

      Images are matched by file-name suffix (jpg, jpeg, png, bmp, webp),
      compared case-insensitively, and each file is collected exactly once.
      They are processed in ascending order of the integer that precedes the
      first underscore of the file stem (stems without such a number count
      as 0); ties are broken by file name so the order is deterministic.

      Args:
          image_dir: Directory containing the images (str or Path).
          model_path: Forwarded unchanged to ``detect_and_ocr_comic``.
          output_dir: Directory handed to ``detect_and_ocr_comic`` for its
              results. Defaults to ``<image_dir>/ocr``. Created if missing.

      Returns:
          A list with the value returned by ``detect_and_ocr_comic`` for each
          image that was processed without raising.

      Raises:
          FileNotFoundError: If ``image_dir`` does not exist.
      """
      import traceback

      image_dir = Path(image_dir)
      if not image_dir.exists():
          raise FileNotFoundError(f"图片目录不存在: {image_dir}")

      image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

      def page_order(path):
          # Leading number of the stem ("12_cover" -> 12). isdecimal() is used
          # because isdigit() also accepts characters such as '²' that int()
          # cannot parse, which would abort the whole batch.
          prefix = path.stem.split('_')[0]
          number = int(prefix) if prefix.isdecimal() else 0
          return number, path.name

      # A single directory scan with a case-insensitive suffix test: globbing
      # both "*.jpg" and "*.JPG" lists every file twice on case-insensitive
      # filesystems and misses mixed-case suffixes on case-sensitive ones.
      image_files = sorted(
          (
              entry
              for entry in image_dir.glob('*')
              if entry.name.lower().endswith(image_suffixes)
          ),
          key=page_order,
      )
      print(f"[INFO] 找到 {len(image_files)} 张图片")

      # Resolve and create the output directory.
      if output_dir is None:
          output_dir = image_dir / 'ocr'
      else:
          output_dir = Path(output_dir)
      output_dir.mkdir(parents=True, exist_ok=True)

      results = []
      for i, image_file in enumerate(image_files, 1):
          print(f"\n[{i}/{len(image_files)}] 处理: {image_file.name}")
          try:
              result = detect_and_ocr_comic(image_file, model_path, output_dir)
              results.append(result)
          except Exception as e:
              # Best effort: report the failing image and keep going.
              print(f"[ERROR] 处理 {image_file.name} 失败: {e}")
              traceback.print_exc()
              continue

      print(f"\n[OK] 批量处理完成,成功处理 {len(results)} 张图片")
      return results
  if __name__ == '__main__':
      # Command-line entry point: a single image file is processed directly,
      # a directory is handed to the batch routine, anything else is an error.
      import argparse

      cli = argparse.ArgumentParser(description='检测漫画文字区域并用OCR识别')
      cli.add_argument('input', help='输入图片路径或目录')
      cli.add_argument('-o', '--output', help='输出目录')
      cli.add_argument('-m', '--model', help='comic-text-detector模型路径')
      options = cli.parse_args()

      target = Path(options.input)

      # Reject paths that are neither a regular file nor a directory up front.
      if not (target.is_file() or target.is_dir()):
          print(f"[ERROR] 输入路径不存在: {target}")
          sys.exit(1)

      # Both handlers take (path, model, output) in the same positions.
      handler = detect_and_ocr_comic if target.is_file() else batch_detect_and_ocr
      handler(target, options.model, options.output)