// Repository: yichael/AIStoryBoard
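/**
 * OCR text-block recognition workflow:
 * 1. Expose the startOcrDialogBlockReg() entry point.
 * 2. Accept imagePath (path of the source image).
 * 3. Accept textBlocksJsonPath (path of the text-region JSON file to write).
 * 4. Accept textRegionImagePath (path of the annotated text-region image to write).
 * 5. Run OnnxOCR on the image and derive each text region (green box) from the
 *    coordinates returned by the OCR engine.
 * 6. Save the JSON file to the requested path and draw the green boxes onto the
 *    annotated image.
 */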

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { execFileSync, execSync } from 'child_process';
import { getPythonPath } from './python-path.js';

// ES modules have no built-in __filename/__dirname, so both are derived from
// this module's file:// URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Parent of this module's directory; used below as the base for locating the
// Python scripts and as the working directory of the child processes.
const projectRoot = path.join(__dirname, '..');

/**
 * Entry point of the OCR text-block pipeline: validates the arguments, runs
 * OCR on the image, then persists the JSON result and the annotated image.
 *
 * @param {string} imagePath - Path of the image to analyse; the file must exist.
 * @param {string} textBlocksJsonPath - Destination path of the text-region JSON file.
 * @param {string} textRegionImagePath - Destination path of the annotated text-region image.
 * @param {Object} [ocrConfig={}] - Optional OnnxOCR settings forwarded to the OCR step.
 * @returns {Promise<Object>} The summary object returned by saveJsonToSpecificPath.
 * @throws {Error} When an argument is missing, the image file does not exist,
 *   or one of the later steps fails (the error is logged and rethrown).
 */
async function startOcrDialogBlockReg(imagePath, textBlocksJsonPath, textRegionImagePath, ocrConfig = {}) {
  try {
    console.log('🔍 开始OCR文字识别');

    // Source image: mandatory argument that must point at an existing file.
    if (!imagePath) {
      throw new Error('步骤2失败: imagePath 参数不能为空');
    }
    console.log(`📷 步骤2: 图片路径 - ${imagePath}`);

    const imageIsMissing = !fs.existsSync(imagePath);
    if (imageIsMissing) {
      throw new Error(`步骤2失败: 图片文件不存在 - ${imagePath}`);
    }

    // Destination of the text-region JSON file: mandatory.
    if (!textBlocksJsonPath) {
      throw new Error('步骤3失败: textBlocksJsonPath 参数不能为空');
    }
    console.log(`📄 步骤3: JSON文件路径 - ${textBlocksJsonPath}`);

    // Destination of the annotated image: mandatory.
    if (!textRegionImagePath) {
      throw new Error('步骤4失败: textRegionImagePath 参数不能为空');
    }
    console.log(`🖼️ 步骤4: 图片文件路径 - ${textRegionImagePath}`);

    // The OCR output directory is the folder that will hold the JSON file;
    // create it on demand.
    const jsonFolder = path.dirname(textBlocksJsonPath);
    const folderIsMissing = !fs.existsSync(jsonFolder);
    if (folderIsMissing) {
      fs.mkdirSync(jsonFolder, { recursive: true });
    }

    // Run OCR and compute the text regions.
    console.log('\n🔍 步骤4: 开始OCR识别和文字区域计算...');
    const hasCustomConfig = Object.keys(ocrConfig).length !== 0;
    if (hasCustomConfig) {
      console.log('⚙️ 使用自定义OCR配置:', ocrConfig);
    }
    const recognition = await performOcrRecognition(imagePath, jsonFolder, ocrConfig);

    // Persist the JSON result and the annotated image.
    console.log('\n💾 步骤5: 保存结果到指定文件...');
    const summary = await saveJsonToSpecificPath(recognition, textBlocksJsonPath, textRegionImagePath);

    console.log('✅ 所有步骤完成！');
    console.log(`📊 识别到 ${summary.totalCount} 个文字块`);
    console.log(`📄 结果已保存: ${summary.jsonPath}`);

    return summary;
  } catch (failure) {
    // Log for the operator, then let the caller decide how to react.
    console.error(`❌ OCR处理失败: ${failure.message}`);
    throw failure;
  }
}

/**
 * Runs the OnnxOCR Python script on an image and converts the dialogues it
 * reports into rectangular text regions (green boxes).
 *
 * The script is invoked with an argument vector instead of a shell command
 * string: the serialized configuration contains double quotes (and possibly
 * spaces), which a shell would strip or split when the value is merely wrapped
 * in `"..."`.
 *
 * @param {string} imagePath - Path of the image to analyse.
 * @param {string} outputDir - Directory in which the script writes `<imageName>_dialogues.json`.
 * @param {Object} [ocrConfig={}] - OnnxOCR settings, passed to the script as a JSON string.
 * @returns {Promise<Object>} `{ imagePath, imageName, textRegions, originalOcrResult }`.
 * @throws {Error} When the OCR script is missing, the Python process fails, the
 *   result file is not produced, or the result lacks a `dialogues` array.
 */
async function performOcrRecognition(imagePath, outputDir, ocrConfig = {}) {
  const pythonEnv = getPythonPath();
  const ocrScript = path.join(projectRoot, 'python', 'generate-anim', 'ocr_with_onnxocr.py');

  // Fail early with a clear message if the OCR script is not where we expect it.
  if (!fs.existsSync(ocrScript)) {
    throw new Error(`步骤4失败: OCR脚本不存在 - ${ocrScript}`);
  }

  console.log('🤖 调用OnnxOCR引擎...');

  // The child runs with cwd = projectRoot, so hand it absolute paths.
  const absImagePath = path.resolve(imagePath);
  const absOutputDir = path.resolve(outputDir);

  // The configuration travels as a single JSON argument.
  const configJson = JSON.stringify(ocrConfig);

  // Positional arguments expected by the script; the second one is
  // intentionally an empty string, exactly as in the previous command line.
  const scriptArgs = [ocrScript, absImagePath, '', absOutputDir, configJson];

  // NOTE(review): without a shell, getPythonPath() must resolve to a real
  // executable (not a .cmd/.bat shim) on Windows — confirm for your setup.
  execFileSync(pythonEnv, scriptArgs, {
    encoding: 'utf-8',
    stdio: 'inherit',
    cwd: projectRoot,
    env: {
      ...process.env,
      PYTHONIOENCODING: 'utf-8',
      PYTHONUTF8: '1'
    }
  });

  console.log('📍 计算文字区域坐标...');

  // The script names its result file after the image (without extension).
  const imageName = path.basename(imagePath, path.extname(imagePath));
  const dialoguesJsonPath = path.join(outputDir, `${imageName}_dialogues.json`);

  // Give the result file a moment to appear, then load it.
  await waitForFileGeneration(dialoguesJsonPath);

  const jsonContent = fs.readFileSync(dialoguesJsonPath, 'utf-8');
  const ocrResult = JSON.parse(jsonContent);

  if (!ocrResult.dialogues || !Array.isArray(ocrResult.dialogues)) {
    throw new Error('步骤4失败: OCR结果格式不正确，缺少dialogues数组');
  }

  // Convert the OCR polygons into axis-aligned rectangles (green boxes).
  const textRegions = calculateTextRegions(ocrResult.dialogues);

  console.log(`✅ 步骤4完成: 识别到 ${textRegions.length} 个文字区域`);

  return {
    imagePath: imagePath,
    imageName: imageName,
    textRegions: textRegions,
    originalOcrResult: ocrResult
  };
}

/**
 * Converts OCR dialogues into text regions with an axis-aligned bounding box
 * (the "green box").
 *
 * Each dialogue's `bbox` is expected to be a non-empty array of `[x, y]`
 * points. When it is missing, empty, or contains anything that is not a point
 * with finite coordinates, the region's `bbox` is `null` instead of an object
 * filled with `Infinity`/`NaN` (which would serialize to `null` fields and
 * break consumers that read the coordinates).
 *
 * @param {Array<Object>} dialogues - Dialogues reported by the OCR script.
 * @returns {Array<Object>} One region per dialogue:
 *   `{ region_index, text, confidence, bbox, green_box_coordinates }`, where
 *   `bbox` and `green_box_coordinates` reference the same object (or are both null).
 */
function calculateTextRegions(dialogues) {
  // A usable point is an array whose first two entries are finite numbers.
  const isUsablePoint = (point) =>
    Array.isArray(point) &&
    point.length >= 2 &&
    Number.isFinite(Number(point[0])) &&
    Number.isFinite(Number(point[1]));

  // Returns the enclosing rectangle of the points, or null if they are unusable.
  const toBoundingBox = (points) => {
    if (!Array.isArray(points) || points.length === 0 || !points.every(isUsablePoint)) {
      return null;
    }

    const xCoords = points.map((point) => Number(point[0]));
    const yCoords = points.map((point) => Number(point[1]));

    const box = {
      x1: Math.min(...xCoords),
      y1: Math.min(...yCoords),
      x2: Math.max(...xCoords),
      y2: Math.max(...yCoords)
    };

    // Derived geometry of the rectangle.
    box.width = box.x2 - box.x1;
    box.height = box.y2 - box.y1;
    box.center_x = (box.x1 + box.x2) / 2;
    box.center_y = (box.y1 + box.y2) / 2;
    box.area = box.width * box.height;

    return box;
  };

  return dialogues.map((dialogue, index) => {
    const bbox = toBoundingBox(dialogue.bbox);

    return {
      region_index: index + 1,
      text: dialogue.text || '',
      confidence: dialogue.confidence || 0,
      bbox: bbox,
      green_box_coordinates: bbox  // same rectangle, exposed under a second key
    };
  });
}

/**
 * Writes the OCR result as a JSON document and renders the annotated image
 * with a green box around every text region.
 *
 * @param {Object} ocrData - Result of performOcrRecognition (`imagePath` and `textRegions` are read).
 * @param {string} textBlocksJsonPath - Path of the JSON file to write.
 * @param {string} textRegionImagePath - Path of the annotated image to produce.
 * @returns {Promise<Object>} `{ jsonPath, textRegionImagePath, outputDir, textRegions, totalCount }`.
 */
async function saveJsonToSpecificPath(ocrData, textBlocksJsonPath, textRegionImagePath) {
  const jsonFileName = path.basename(textBlocksJsonPath);
  console.log(`📝 保存JSON到: ${jsonFileName}`);
  const imageFileName = path.basename(textRegionImagePath);
  console.log(`🖼️ 绘制图片到: ${imageFileName}`);

  const regions = ocrData.textRegions;

  // Document written to disk; the key order defines the layout of the file.
  const payload = {
    image_file: path.basename(ocrData.imagePath),
    processing_time: new Date().toISOString(),
    text_regions: regions,
    total_count: regions.length,
    ocr_engine: "OnnxOCR",
    output_files: {
      json_file: textBlocksJsonPath,
      region_image: textRegionImagePath
    },
    green_box_info: {
      description: "绿色框坐标信息用于标注文字区域",
      coordinate_system: "左上角为原点，x向右递增，y向下递增"
    }
  };

  // Persist the JSON document (pretty-printed with two spaces).
  const serialized = JSON.stringify(payload, null, 2);
  fs.writeFileSync(textBlocksJsonPath, serialized, 'utf-8');
  console.log(`✅ JSON文件已保存: ${jsonFileName}`);

  // Render the annotated image from the original picture.
  console.log(`🎨 绘制绿色框标注图片...`);
  await drawTextRegionsWithGreenBoxes(ocrData.imagePath, regions, textRegionImagePath);
  console.log(`✅ 绿色框图片已保存: ${imageFileName}`);

  return {
    jsonPath: textBlocksJsonPath,
    textRegionImagePath,
    outputDir: path.dirname(textBlocksJsonPath),
    textRegions: regions,
    totalCount: regions.length
  };
}

/**
 * Renders a copy of the original image with a green rectangle around every
 * text region by delegating to the `draw_text_regions.py` Python script.
 *
 * The regions are handed to the script through a temporary JSON file placed
 * next to the output image; the file is removed afterwards, even on failure.
 *
 * @param {string} originalImagePath - Path of the source image.
 * @param {Array<Object>} textRegions - Regions as produced by calculateTextRegions.
 * @param {string} outputImagePath - Path of the annotated image to create.
 * @returns {Promise<void>}
 * @throws {Error} When the Python process fails or the output image does not appear.
 */
async function drawTextRegionsWithGreenBoxes(originalImagePath, textRegions, outputImagePath) {
  const pythonEnv = getPythonPath();
  const drawScript = path.join(projectRoot, 'python', 'generate-anim', 'draw_text_regions.py');

  // Generate the drawing script on first use if it is not part of the checkout.
  if (!fs.existsSync(drawScript)) {
    console.log('🔧 创建绘制脚本...');
    await createDrawTextRegionsScript(drawScript);
  }

  // The child runs with cwd = projectRoot, so every path it receives must be
  // absolute; otherwise relative paths would resolve against the wrong folder.
  const absImagePath = path.resolve(originalImagePath);
  const absOutputImagePath = path.resolve(outputImagePath);
  const outputDir = path.dirname(absOutputImagePath);

  // The temp file lives in the output folder, which may not exist yet.
  fs.mkdirSync(outputDir, { recursive: true });

  // A per-process, per-call name keeps concurrent runs from overwriting or
  // deleting each other's temp file.
  const tempJsonPath = path.join(
    outputDir,
    `temp_text_regions_${process.pid}_${Date.now()}.json`
  );
  const tempData = {
    image_path: absImagePath,
    text_regions: textRegions
  };

  try {
    fs.writeFileSync(tempJsonPath, JSON.stringify(tempData, null, 2), 'utf-8');

    // Argument vector instead of a shell string: paths containing quotes,
    // spaces or shell metacharacters reach the script unchanged.
    // NOTE(review): without a shell, getPythonPath() must resolve to a real
    // executable (not a .cmd/.bat shim) on Windows — confirm for your setup.
    execFileSync(pythonEnv, [drawScript, tempJsonPath, absOutputImagePath], {
      encoding: 'utf-8',
      stdio: 'inherit',
      cwd: projectRoot,
      env: {
        ...process.env,
        PYTHONIOENCODING: 'utf-8',
        PYTHONUTF8: '1'
      }
    });

    // Make sure the annotated image really exists before reporting success.
    await waitForFileGeneration(absOutputImagePath);
  } finally {
    // Always remove the temp file, whether drawing succeeded or not.
    if (fs.existsSync(tempJsonPath)) {
      fs.unlinkSync(tempJsonPath);
    }
  }
}

/**
 * Writes the Python helper that draws green boxes around text regions.
 *
 * The generated script takes two command-line arguments — a JSON file holding
 * `image_path` and `text_regions`, and the output image path — and uses
 * OpenCV (`cv2`) and NumPy to render the boxes.
 *
 * @param {string} scriptPath - Where the Python script should be written;
 *   missing parent directories are created.
 * @returns {Promise<void>}
 */
async function createDrawTextRegionsScript(scriptPath) {
  // Source of the Python script, emitted verbatim.
  const pythonSource = `import cv2
import numpy as np
import json
import sys
from pathlib import Path

def draw_text_regions_with_green_boxes(temp_json_path, output_image_path):
    try:
        # 确保输出编码为UTF-8
        sys.stdout.reconfigure(encoding='utf-8')
        sys.stderr.reconfigure(encoding='utf-8')
        
        print(f"[INFO] 读取文字区域数据: {temp_json_path}")
        with open(temp_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        original_image_path = data['image_path']
        text_regions = data['text_regions']
        
        print(f"[INFO] 读取原图片: {original_image_path}")
        
        # 读取原图片（支持中文路径）
        img_array = np.fromfile(str(original_image_path), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        
        if img is None:
            raise ValueError(f"无法读取图片: {original_image_path}")
        
        print(f"[INFO] 图片尺寸: {img.shape[:2][::-1]} (宽x高)")
        print(f"[INFO] 文字区域数量: {len(text_regions)}")
        
        # 绘制每个文字区域的绿色框
        for i, region in enumerate(text_regions):
            bbox = region.get('bbox') or region.get('green_box_coordinates')
            text = region.get('text', '')
            
            if bbox and all(k in bbox for k in ['x1', 'y1', 'x2', 'y2']):
                # 提取坐标
                x1, y1 = int(bbox['x1']), int(bbox['y1'])
                x2, y2 = int(bbox['x2']), int(bbox['y2'])
                
                # 绘制绿色矩形框（BGR格式，绿色为(0, 255, 0)）
                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                
                # 可选：添加文字标签
                if text and len(text) > 0:
                    # 在框的上方添加文字（如果空间足够）
                    label_y = max(y1 - 5, 15)
                    cv2.putText(img, f"{i+1}", (x1, label_y), 
                              cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        
        # 保存结果图片
        output_path = Path(output_image_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # 使用cv2.imencode处理中文路径
        success, encoded_img = cv2.imencode('.png', img)
        if success:
            with open(str(output_path), 'wb') as f:
                f.write(encoded_img.tobytes())
            print(f"[OK] 绿色框图片已保存: {output_path}")
        else:
            raise ValueError("图片编码失败")
        
        print(f"[SUCCESS] 成功绘制 {len(text_regions)} 个文字区域的绿色框")
        
    except Exception as e:
        print(f"[ERROR] 绘制绿色框失败: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python draw_text_regions.py <temp_json_path> <output_image_path>")
        sys.exit(1)
    
    temp_json_file = sys.argv[1]
    output_img = sys.argv[2]
    draw_text_regions_with_green_boxes(temp_json_file, output_img)
`;

  // Create the destination folder first when it is not there yet.
  const targetDir = path.dirname(scriptPath);
  const targetDirMissing = !fs.existsSync(targetDir);
  if (targetDirMissing) {
    fs.mkdirSync(targetDir, { recursive: true });
  }

  // Emit the script and report its file name.
  fs.writeFileSync(scriptPath, pythonSource, 'utf-8');
  const scriptName = path.basename(scriptPath);
  console.log(`✅ 绘制脚本已创建: ${scriptName}`);
}

/**
 * Polls the file system until a file exists or the retry budget is used up.
 *
 * @param {string} filePath - Path of the file to wait for.
 * @param {Object} [options] - Polling settings.
 * @param {number} [options.retries=50] - Maximum number of waits before giving up.
 * @param {number} [options.intervalMs=100] - Delay between two checks, in milliseconds.
 * @returns {Promise<void>} Resolves as soon as the file exists.
 * @throws {Error} When the file still does not exist after the last retry.
 */
async function waitForFileGeneration(filePath, { retries = 50, intervalMs = 100 } = {}) {
  let remaining = retries;
  while (remaining > 0 && !fs.existsSync(filePath)) {
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
    remaining -= 1;
  }

  // Final check covers both "never appeared" and a zero retry budget.
  if (!fs.existsSync(filePath)) {
    // Neutral wording: this helper waits for the OCR result file as well as
    // for the rendered image, so it must not blame a specific step.
    throw new Error(`文件未生成（等待超时） - ${filePath}`);
  }
}

/**
 * Builds a complete OnnxOCR configuration by layering caller overrides on top
 * of the built-in defaults.
 *
 * @param {Object} [customConfig={}] - Settings that replace or extend the defaults.
 * @returns {Object} A new configuration object; keys from `customConfig` win.
 */
function createOcrConfig(customConfig = {}) {
  // General engine switches.
  const engine = {
    use_angle_cls: true,   // enable the angle classifier
    use_gpu: false         // false = run on CPU
  };

  // Text detection settings (affect detection accuracy).
  const detection = {
    det_db_thresh: 0.2,         // detection threshold; smaller = more sensitive (0.1-0.5)
    det_db_box_thresh: 0.5,     // box confidence threshold; smaller = more boxes (0.3-0.8)
    det_limit_side_len: 1280,   // processing size; larger = more precise (960/1280/1536)
    det_db_unclip_ratio: 1.5,   // box expansion ratio (1.2-2.0)
    det_box_type: "quad"        // box type (quad/poly)
  };

  // Text recognition settings (affect recognition accuracy).
  const recognition = {
    drop_score: 0.3,                 // confidence cut-off; smaller = keep more (0.2-0.6)
    rec_image_shape: "3, 48, 320",   // recognition input shape (height affects accuracy)
    rec_batch_num: 6,                // recognition batch size
    max_text_length: 25              // maximum text length
  };

  // Advanced settings.
  const advanced = {
    use_dilation: false,         // apply dilation
    det_db_score_mode: "fast"    // score mode (fast/slow)
  };

  // Named preset (balanced/high_precision/fast).
  const presetSelection = {
    preset: "high_precision"
  };

  // Later spreads win, so caller overrides are applied last.
  return {
    ...engine,
    ...detection,
    ...recognition,
    ...advanced,
    ...presetSelection,
    ...customConfig
  };
}

/**
 * Returns the OCR settings of a named preset.
 *
 * Only the presets defined in this function are recognised; any other name —
 * including names that merely exist on `Object.prototype`, such as
 * `"constructor"` or `"toString"` — falls back to the balanced preset.
 *
 * @param {string} presetName - Preset name (balanced/high_precision/fast).
 * @returns {Object} A fresh object holding the preset's settings.
 */
function getPresetConfig(presetName) {
  const presets = {
    // Balanced settings (the fallback).
    balanced: {
      det_db_thresh: 0.3,
      det_db_box_thresh: 0.6,
      det_limit_side_len: 960,
      drop_score: 0.5,
      use_angle_cls: true
    },

    // High precision: more accurate recognition, but slower.
    high_precision: {
      det_db_thresh: 0.2,
      det_db_box_thresh: 0.5,
      det_limit_side_len: 1280,
      drop_score: 0.3,
      use_angle_cls: true,
      det_db_unclip_ratio: 1.6
    },

    // Fast: speed first, slightly lower accuracy.
    fast: {
      det_db_thresh: 0.4,
      det_db_box_thresh: 0.7,
      det_limit_side_len: 640,
      drop_score: 0.6,
      use_angle_cls: false,
      det_db_unclip_ratio: 1.4
    }
  };

  // Own-property check: a plain `presets[name] || ...` lookup would also find
  // inherited members of Object.prototype and return them as a "preset".
  const isKnownPreset = Object.prototype.hasOwnProperty.call(presets, presetName);
  return isKnownPreset ? presets[presetName] : presets.balanced;
}

// Names exported by this module: the pipeline entry point plus the two
// configuration helpers.
export { 
  startOcrDialogBlockReg,
  createOcrConfig,
  getPresetConfig
};