// ocr-dialog-block-reg.js
  1. /**
  2. * 步骤:
  3. * 1. 创建startOcrDialogBlockReg()函数
  4. * 2. 接收参数:imagePath(图片路径)
  5. * 3. 接收参数:textBlocksJsonPath(文字区域json文件路径)
  6. * 4. 接收参数:textRegionImagePath(文字区域图片路径)
  7. * 5. 利用onnxOCR识别图片中的文字区域,通过OCR返回的坐标信息计算出文字区域(绿色框)
  8. * 6. 保存json文件到指定路径下
  9. */
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { execFileSync, execSync } from 'child_process';
import { getPythonPath } from './python-path.js';
  15. const __filename = fileURLToPath(import.meta.url);
  16. const __dirname = path.dirname(__filename);
  17. const projectRoot = path.join(__dirname, '..');
  18. /**
  19. * 步骤1: 创建startOcrDialogBlockReg()函数
  20. * @param {string} imagePath - 步骤2: 图片路径参数
  21. * @param {string} textBlocksJsonPath - 步骤3: 文字区域json文件路径参数
  22. * @param {string} textRegionImagePath - 步骤4: 绘制文字区域图片路径参数
  23. * @param {Object} ocrConfig - OnnxOCR配置参数(可选)
  24. * @returns {Object} OCR识别结果
  25. */
  26. async function startOcrDialogBlockReg(imagePath, textBlocksJsonPath, textRegionImagePath, ocrConfig = {}) {
  27. try {
  28. console.log('🔍 开始OCR文字识别');
  29. // 步骤2: 接收参数:imagePath(图片路径)
  30. if (!imagePath) {
  31. throw new Error('步骤2失败: imagePath 参数不能为空');
  32. }
  33. console.log(`📷 步骤2: 图片路径 - ${imagePath}`);
  34. // 验证图片文件存在
  35. if (!fs.existsSync(imagePath)) {
  36. throw new Error(`步骤2失败: 图片文件不存在 - ${imagePath}`);
  37. }
  38. // 步骤3: 接收参数:textBlocksJsonPath(文字区域json文件路径)
  39. if (!textBlocksJsonPath) {
  40. throw new Error('步骤3失败: textBlocksJsonPath 参数不能为空');
  41. }
  42. console.log(`📄 步骤3: JSON文件路径 - ${textBlocksJsonPath}`);
  43. // 步骤4: 接收参数:textRegionImagePath(文字区域图片路径)
  44. if (!textRegionImagePath) {
  45. throw new Error('步骤4失败: textRegionImagePath 参数不能为空');
  46. }
  47. console.log(`🖼️ 步骤4: 图片文件路径 - ${textRegionImagePath}`);
  48. // 从textBlocksJsonPath推断输出目录
  49. const outputDir = path.dirname(textBlocksJsonPath);
  50. // 确保输出目录存在
  51. if (!fs.existsSync(outputDir)) {
  52. fs.mkdirSync(outputDir, { recursive: true });
  53. }
  54. // 步骤4: 利用onnxOCR识别图片中的文字区域,通过OCR返回的坐标信息计算出文字区域
  55. console.log('\n🔍 步骤4: 开始OCR识别和文字区域计算...');
  56. if (Object.keys(ocrConfig).length > 0) {
  57. console.log('⚙️ 使用自定义OCR配置:', ocrConfig);
  58. }
  59. const ocrData = await performOcrRecognition(imagePath, outputDir, ocrConfig);
  60. // 步骤5: 保存json文件到指定路径
  61. console.log('\n💾 步骤5: 保存结果到指定文件...');
  62. const savedResult = await saveJsonToSpecificPath(ocrData, textBlocksJsonPath, textRegionImagePath);
  63. console.log('✅ 所有步骤完成!');
  64. console.log(`📊 识别到 ${savedResult.totalCount} 个文字块`);
  65. console.log(`📄 结果已保存: ${savedResult.jsonPath}`);
  66. return savedResult;
  67. } catch (error) {
  68. console.error(`❌ OCR处理失败: ${error.message}`);
  69. throw error;
  70. }
  71. }
  72. /**
  73. * 步骤4: 利用onnxOCR识别图片中的文字区域,通过OCR返回的坐标信息计算出文字区域(绿色框)
  74. * @param {string} imagePath - 图片路径
  75. * @param {string} outputDir - 输出目录
  76. * @param {Object} ocrConfig - OnnxOCR配置参数
  77. * @returns {Object} OCR识别数据
  78. */
  79. async function performOcrRecognition(imagePath, outputDir, ocrConfig = {}) {
  80. const pythonEnv = getPythonPath();
  81. const ocrScript = path.join(projectRoot, 'python', 'generate-anim', 'ocr_with_onnxocr.py');
  82. // 检查OCR脚本是否存在
  83. if (!fs.existsSync(ocrScript)) {
  84. throw new Error(`步骤4失败: OCR脚本不存在 - ${ocrScript}`);
  85. }
  86. console.log('🤖 调用OnnxOCR引擎...');
  87. // 构建OCR命令和配置参数
  88. const absImagePath = path.resolve(imagePath);
  89. const absOutputDir = path.resolve(outputDir);
  90. // 将配置参数序列化为JSON字符串传递给Python脚本
  91. const configJson = JSON.stringify(ocrConfig);
  92. const command = `"${pythonEnv}" "${ocrScript}" "${absImagePath}" "" "${absOutputDir}" "${configJson}"`;
  93. // 执行OCR识别
  94. execSync(command, {
  95. encoding: 'utf-8',
  96. stdio: 'inherit',
  97. cwd: projectRoot,
  98. env: {
  99. ...process.env,
  100. PYTHONIOENCODING: 'utf-8',
  101. PYTHONUTF8: '1'
  102. },
  103. shell: true
  104. });
  105. console.log('📍 计算文字区域坐标...');
  106. // 读取OCR结果并计算文字区域
  107. const imageName = path.basename(imagePath, path.extname(imagePath));
  108. const dialoguesJsonPath = path.join(outputDir, `${imageName}_dialogues.json`);
  109. // 等待OCR结果文件生成
  110. await waitForFileGeneration(dialoguesJsonPath);
  111. // 读取和解析OCR结果
  112. const jsonContent = fs.readFileSync(dialoguesJsonPath, 'utf-8');
  113. const ocrResult = JSON.parse(jsonContent);
  114. if (!ocrResult.dialogues || !Array.isArray(ocrResult.dialogues)) {
  115. throw new Error('步骤4失败: OCR结果格式不正确,缺少dialogues数组');
  116. }
  117. // 计算文字区域(绿色框)
  118. const textRegions = calculateTextRegions(ocrResult.dialogues);
  119. console.log(`✅ 步骤4完成: 识别到 ${textRegions.length} 个文字区域`);
  120. return {
  121. imagePath: imagePath,
  122. imageName: imageName,
  123. textRegions: textRegions,
  124. originalOcrResult: ocrResult
  125. };
  126. }
  127. /**
  128. * 计算文字区域坐标信息(绿色框)
  129. * @param {Array} dialogues - OCR识别的对话数组
  130. * @returns {Array} 文字区域数组
  131. */
  132. function calculateTextRegions(dialogues) {
  133. return dialogues.map((dialogue, index) => {
  134. let bbox = null;
  135. // 处理OCR返回的坐标信息
  136. if (dialogue.bbox && Array.isArray(dialogue.bbox)) {
  137. const points = dialogue.bbox;
  138. const xCoords = points.map(p => p[0]);
  139. const yCoords = points.map(p => p[1]);
  140. // 计算矩形边界框(绿色框区域)
  141. bbox = {
  142. x1: Math.min(...xCoords),
  143. y1: Math.min(...yCoords),
  144. x2: Math.max(...xCoords),
  145. y2: Math.max(...yCoords)
  146. };
  147. // 计算区域属性
  148. bbox.width = bbox.x2 - bbox.x1;
  149. bbox.height = bbox.y2 - bbox.y1;
  150. bbox.center_x = (bbox.x1 + bbox.x2) / 2;
  151. bbox.center_y = (bbox.y1 + bbox.y2) / 2;
  152. bbox.area = bbox.width * bbox.height;
  153. }
  154. return {
  155. region_index: index + 1,
  156. text: dialogue.text || '',
  157. confidence: dialogue.confidence || 0,
  158. bbox: bbox,
  159. green_box_coordinates: bbox // 绿色框坐标
  160. };
  161. });
  162. }
  163. /**
  164. * 步骤5: 保存json文件到指定路径并绘制带绿色框的图片
  165. * @param {Object} ocrData - OCR数据
  166. * @param {string} textBlocksJsonPath - 指定的JSON文件路径
  167. * @param {string} textRegionImagePath - 指定的图片文件路径
  168. * @returns {Object} 保存结果
  169. */
  170. async function saveJsonToSpecificPath(ocrData, textBlocksJsonPath, textRegionImagePath) {
  171. console.log(`📝 保存JSON到: ${path.basename(textBlocksJsonPath)}`);
  172. console.log(`🖼️ 绘制图片到: ${path.basename(textRegionImagePath)}`);
  173. // 构造结果数据
  174. const resultData = {
  175. image_file: path.basename(ocrData.imagePath),
  176. processing_time: new Date().toISOString(),
  177. text_regions: ocrData.textRegions,
  178. total_count: ocrData.textRegions.length,
  179. ocr_engine: "OnnxOCR",
  180. output_files: {
  181. json_file: textBlocksJsonPath,
  182. region_image: textRegionImagePath
  183. },
  184. green_box_info: {
  185. description: "绿色框坐标信息用于标注文字区域",
  186. coordinate_system: "左上角为原点,x向右递增,y向下递增"
  187. }
  188. };
  189. // 步骤5: 保存json文件到指定路径
  190. fs.writeFileSync(textBlocksJsonPath, JSON.stringify(resultData, null, 2), 'utf-8');
  191. console.log(`✅ JSON文件已保存: ${path.basename(textBlocksJsonPath)}`);
  192. // 步骤6: 绘制带绿色框的文字区域图片
  193. console.log(`🎨 绘制绿色框标注图片...`);
  194. await drawTextRegionsWithGreenBoxes(ocrData.imagePath, ocrData.textRegions, textRegionImagePath);
  195. console.log(`✅ 绿色框图片已保存: ${path.basename(textRegionImagePath)}`);
  196. return {
  197. jsonPath: textBlocksJsonPath,
  198. textRegionImagePath: textRegionImagePath,
  199. outputDir: path.dirname(textBlocksJsonPath),
  200. textRegions: ocrData.textRegions,
  201. totalCount: ocrData.textRegions.length
  202. };
  203. }
  204. /**
  205. * 绘制带绿色框的文字区域图片
  206. * @param {string} originalImagePath - 原图片路径
  207. * @param {Array} textRegions - 文字区域数组
  208. * @param {string} outputImagePath - 输出图片路径
  209. */
  210. async function drawTextRegionsWithGreenBoxes(originalImagePath, textRegions, outputImagePath) {
  211. const pythonEnv = getPythonPath();
  212. const drawScript = path.join(projectRoot, 'python', 'generate-anim', 'draw_text_regions.py');
  213. // 如果Python脚本不存在,创建一个
  214. if (!fs.existsSync(drawScript)) {
  215. console.log('🔧 创建绘制脚本...');
  216. await createDrawTextRegionsScript(drawScript);
  217. }
  218. // 创建临时JSON文件传递文字区域数据
  219. const tempJsonPath = path.join(path.dirname(outputImagePath), 'temp_text_regions.json');
  220. const tempData = {
  221. image_path: originalImagePath,
  222. text_regions: textRegions
  223. };
  224. fs.writeFileSync(tempJsonPath, JSON.stringify(tempData, null, 2), 'utf-8');
  225. try {
  226. // 调用Python脚本绘制绿色框
  227. const command = `"${pythonEnv}" "${drawScript}" "${tempJsonPath}" "${outputImagePath}"`;
  228. execSync(command, {
  229. encoding: 'utf-8',
  230. stdio: 'inherit',
  231. cwd: projectRoot,
  232. env: {
  233. ...process.env,
  234. PYTHONIOENCODING: 'utf-8',
  235. PYTHONUTF8: '1'
  236. },
  237. shell: true
  238. });
  239. // 等待文件生成
  240. await waitForFileGeneration(outputImagePath);
  241. } finally {
  242. // 清理临时文件
  243. if (fs.existsSync(tempJsonPath)) {
  244. fs.unlinkSync(tempJsonPath);
  245. }
  246. }
  247. }
/**
 * Writes the Python helper script that draws text regions onto an image.
 *
 * The embedded script takes two command-line arguments (a JSON file path and
 * an output image path). It reads `image_path` and `text_regions` from the
 * JSON file, draws a green rectangle for every region whose box has
 * `x1`/`y1`/`x2`/`y2`, labels regions that have text with their 1-based index,
 * and writes the result PNG-encoded to the output path. It exits with status 1
 * on any error.
 *
 * @param {string} scriptPath - Path at which the script file is created;
 *   its directory is created when it does not exist.
 * @returns {Promise<void>}
 */
async function createDrawTextRegionsScript(scriptPath) {
  const scriptContent = `import cv2
import numpy as np
import json
import sys
from pathlib import Path
def draw_text_regions_with_green_boxes(temp_json_path, output_image_path):
    try:
        # 确保输出编码为UTF-8
        sys.stdout.reconfigure(encoding='utf-8')
        sys.stderr.reconfigure(encoding='utf-8')
        print(f"[INFO] 读取文字区域数据: {temp_json_path}")
        with open(temp_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        original_image_path = data['image_path']
        text_regions = data['text_regions']
        print(f"[INFO] 读取原图片: {original_image_path}")
        # 读取原图片(支持中文路径)
        img_array = np.fromfile(str(original_image_path), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(f"无法读取图片: {original_image_path}")
        print(f"[INFO] 图片尺寸: {img.shape[:2][::-1]} (宽x高)")
        print(f"[INFO] 文字区域数量: {len(text_regions)}")
        # 绘制每个文字区域的绿色框
        for i, region in enumerate(text_regions):
            bbox = region.get('bbox') or region.get('green_box_coordinates')
            text = region.get('text', '')
            if bbox and all(k in bbox for k in ['x1', 'y1', 'x2', 'y2']):
                # 提取坐标
                x1, y1 = int(bbox['x1']), int(bbox['y1'])
                x2, y2 = int(bbox['x2']), int(bbox['y2'])
                # 绘制绿色矩形框(BGR格式,绿色为(0, 255, 0))
                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # 可选:添加文字标签
                if text and len(text) > 0:
                    # 在框的上方添加文字(如果空间足够)
                    label_y = max(y1 - 5, 15)
                    cv2.putText(img, f"{i+1}", (x1, label_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        # 保存结果图片
        output_path = Path(output_image_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # 使用cv2.imencode处理中文路径
        success, encoded_img = cv2.imencode('.png', img)
        if success:
            with open(str(output_path), 'wb') as f:
                f.write(encoded_img.tobytes())
            print(f"[OK] 绿色框图片已保存: {output_path}")
        else:
            raise ValueError("图片编码失败")
        print(f"[SUCCESS] 成功绘制 {len(text_regions)} 个文字区域的绿色框")
    except Exception as e:
        print(f"[ERROR] 绘制绿色框失败: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python draw_text_regions.py <temp_json_path> <output_image_path>")
        sys.exit(1)
    temp_json_file = sys.argv[1]
    output_img = sys.argv[2]
    draw_text_regions_with_green_boxes(temp_json_file, output_img)
`;
  // Make sure the directory that will contain the script exists.
  const scriptDir = path.dirname(scriptPath);
  if (!fs.existsSync(scriptDir)) {
    fs.mkdirSync(scriptDir, { recursive: true });
  }
  // Write the script to disk (overwrites any existing file at scriptPath).
  fs.writeFileSync(scriptPath, scriptContent, 'utf-8');
  console.log(`✅ 绘制脚本已创建: ${path.basename(scriptPath)}`);
}
  326. /**
  327. * 等待文件生成
  328. * @param {string} filePath - 文件路径
  329. */
  330. async function waitForFileGeneration(filePath) {
  331. let retries = 50;
  332. while (retries > 0 && !fs.existsSync(filePath)) {
  333. await new Promise(resolve => setTimeout(resolve, 100));
  334. retries--;
  335. }
  336. if (!fs.existsSync(filePath)) {
  337. throw new Error(`步骤4失败: OCR结果文件未生成 - ${filePath}`);
  338. }
  339. }
  340. /**
  341. * 创建OnnxOCR配置对象
  342. * @param {Object} customConfig - 自定义配置参数
  343. * @returns {Object} 完整的OnnxOCR配置
  344. */
  345. function createOcrConfig(customConfig = {}) {
  346. const defaultConfig = {
  347. // 基础设置
  348. use_angle_cls: true, // 启用角度分类器
  349. use_gpu: false, // 使用GPU(false=CPU)
  350. // 文字检测参数(影响检测精度)
  351. det_db_thresh: 0.2, // 文字检测阈值(越小越敏感,0.1-0.5)
  352. det_db_box_thresh: 0.5, // 文字框置信度阈值(越小检测更多,0.3-0.8)
  353. det_limit_side_len: 1280, // 图片处理尺寸(越大精度越高,960/1280/1536)
  354. det_db_unclip_ratio: 1.5, // 文字框扩展比例(1.2-2.0)
  355. det_box_type: "quad", // 检测框类型(quad/poly)
  356. // 文字识别参数(影响识别精度)
  357. drop_score: 0.3, // 识别置信度阈值(越小保留更多,0.2-0.6)
  358. rec_image_shape: "3, 48, 320", // 识别图片尺寸(高度影响精度)
  359. rec_batch_num: 6, // 识别批处理大小
  360. max_text_length: 25, // 最大文字长度
  361. // 高级参数
  362. use_dilation: false, // 使用膨胀操作
  363. det_db_score_mode: "fast", // 分数计算模式(fast/slow)
  364. // 预设配置
  365. preset: "high_precision" // 预设配置(balanced/high_precision/fast)
  366. };
  367. return { ...defaultConfig, ...customConfig };
  368. }
  369. /**
  370. * 获取预设配置
  371. * @param {string} presetName - 预设名称(balanced/high_precision/fast)
  372. * @returns {Object} 预设配置
  373. */
  374. function getPresetConfig(presetName) {
  375. const presets = {
  376. // 平衡配置(默认)
  377. balanced: {
  378. det_db_thresh: 0.3,
  379. det_db_box_thresh: 0.6,
  380. det_limit_side_len: 960,
  381. drop_score: 0.5,
  382. use_angle_cls: true
  383. },
  384. // 高精度配置(识别更准确,但速度较慢)
  385. high_precision: {
  386. det_db_thresh: 0.2,
  387. det_db_box_thresh: 0.5,
  388. det_limit_side_len: 1280,
  389. drop_score: 0.3,
  390. use_angle_cls: true,
  391. det_db_unclip_ratio: 1.6
  392. },
  393. // 快速配置(速度优先,精度稍低)
  394. fast: {
  395. det_db_thresh: 0.4,
  396. det_db_box_thresh: 0.7,
  397. det_limit_side_len: 640,
  398. drop_score: 0.6,
  399. use_angle_cls: false,
  400. det_db_unclip_ratio: 1.4
  401. }
  402. };
  403. return presets[presetName] || presets.balanced;
  404. }
  405. export {
  406. startOcrDialogBlockReg,
  407. createOcrConfig,
  408. getPresetConfig
  409. };