# -*- coding: utf-8 -*-
# Source repository: yichael/AIStoryBoard
"""
图像预处理：提高OCR准确率
包括：对比度增强、去噪、锐化、二值化
"""

import sys
import cv2
import numpy as np
from pathlib import Path

# Windows encoding fix: the console often defaults to a legacy code page that
# cannot encode the non-ASCII log messages this script prints, so force UTF-8
# (with replacement of unencodable characters) on both standard streams.
if sys.platform == 'win32':
    import io

    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name)
        if _stream is None:
            # No stream attached (e.g. started without a console): nothing to fix.
            continue
        if hasattr(_stream, 'reconfigure'):
            # Preferred: change the encoding in place, keeping the existing
            # stream object and its buffering settings.
            _stream.reconfigure(encoding='utf-8', errors='replace')
        elif hasattr(_stream, 'buffer'):
            # Fallback for text streams without reconfigure(): wrap the
            # underlying binary buffer in a new UTF-8 text layer.
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_stream.buffer, encoding='utf-8', errors='replace'),
            )
        # Streams with neither attribute (custom replacements) are left as-is
        # instead of crashing at import time.


def _remove_small_dark_components(binary):
    """Erase dark connected components that are much smaller than the typical one.

    Args:
        binary: Single-channel uint8 image with dark (0) marks on a light (255)
            background, as produced by the thresholding/morphology steps.

    Returns:
        An image of the same shape and dtype in which every pixel is 0 if it
        belongs to a kept dark component and 255 otherwise. If no dark
        component exists, ``binary`` is returned unchanged.
    """
    # Component analysis treats non-zero pixels as foreground, so run it on the
    # inverted image: dark marks become foreground, label 0 is the background.
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        255 - binary, connectivity=8
    )
    if num_labels <= 1:
        return binary

    areas = stats[:, cv2.CC_STAT_AREA]
    # Threshold: 10% of the median component area (background excluded),
    # but never below 10 pixels.
    min_area = max(10, np.median(areas[1:]) * 0.1)

    # Per-label keep flag used as a lookup table: one pass over the label
    # image instead of one full-image comparison per component.
    keep = areas >= min_area
    keep[0] = False  # the background label is never a mark
    return np.where(keep[labels], 0, 255).astype(binary.dtype)


def preprocess_image_for_ocr(input_path, output_path) -> np.ndarray:
    """Preprocess an image to improve OCR accuracy and save it as PNG.

    The pipeline is tuned for comics with light text on a dark background.

    Steps:
        1. Detect the background type from the mean gray level (< 127 = dark).
        2. Invert dark-background images so the text becomes dark on light.
        3. Boost contrast with CLAHE.
        4. Denoise with ``cv2.fastNlMeansDenoising``.
        5. Sharpen with a 3x3 sharpening kernel.
        6. Binarize with an Otsu threshold (raised by 10 when Otsu picks a
           value below 127).
        7. Clean up with open / close / open morphology and drop very small
           dark connected components.

    Args:
        input_path: Path of the image to read (non-ASCII paths are supported
            because the file is read via NumPy and decoded in memory).
        output_path: Path the PNG-encoded result is written to.

    Returns:
        The processed single-channel binary image as a NumPy array.

    Raises:
        ValueError: If the input file is empty or cannot be decoded as an
            image, or if PNG encoding of the result fails.
        OSError: Propagated from ``np.fromfile`` when the input file cannot be
            opened (e.g. ``FileNotFoundError``).
    """
    # Read the raw bytes ourselves and decode in memory so that paths with
    # non-ASCII characters work.
    img_array = np.fromfile(str(input_path), dtype=np.uint8)
    if img_array.size == 0:
        # cv2.imdecode rejects an empty buffer with an OpenCV assertion error;
        # report it the same way as any other unreadable image instead.
        raise ValueError(f"无法读取图片: {input_path}")

    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f"无法读取图片: {input_path}")

    print(f"[INFO] 读取图片: {Path(input_path).name}")
    print(f"[INFO] 原始图片尺寸: {img.shape[1]}x{img.shape[0]}")

    # IMREAD_COLOR always yields a 3-channel BGR image, so a plain conversion
    # to grayscale is sufficient.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Step 1: classify the background by mean brightness.
    mean_brightness = np.mean(gray)
    is_dark_background = mean_brightness < 127

    print("[INFO] 步骤1: 检测背景类型...")
    print(f"    平均亮度: {mean_brightness:.1f} ({'黑底白字' if is_dark_background else '白底黑字'})")

    # Step 2: invert dark-background images; the rest of the pipeline assumes
    # dark text on a light background.
    if is_dark_background:
        print("[INFO] 步骤2: 颜色反转（黑底白字 -> 白底黑字）...")
        gray = cv2.bitwise_not(gray)
    else:
        print("[INFO] 步骤2: 跳过反转（已是白底黑字）")

    # Step 3: contrast enhancement with CLAHE.
    print("[INFO] 步骤3: 提高对比度（CLAHE）...")
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # Step 4: non-local-means denoising.
    print("[INFO] 步骤4: 去噪...")
    denoised = cv2.fastNlMeansDenoising(
        enhanced, None, h=10, templateWindowSize=7, searchWindowSize=21
    )

    # Step 5: sharpen edges with a 3x3 kernel (center 5, 4-neighbours -1).
    print("[INFO] 步骤5: 锐化...")
    sharpen_kernel = np.array([
        [0, -1, 0],
        [-1, 5, -1],
        [0, -1, 0],
    ])
    sharpened = cv2.filter2D(denoised, -1, sharpen_kernel)

    # Step 6: binarize. Only Otsu plus a fixed offset is applied here; the log
    # message mentions adaptive thresholding but none is performed.
    print("[INFO] 步骤6: 二值化（OTSU + 自适应阈值）...")
    otsu_thresh, binary = cv2.threshold(
        sharpened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    if otsu_thresh < 127:
        # A low Otsu threshold is raised by 10 so that more near-dark pixels
        # end up black (THRESH_BINARY maps values above the threshold to 255).
        _, binary = cv2.threshold(sharpened, otsu_thresh + 10, 255, cv2.THRESH_BINARY)

    # Step 7: morphological clean-up.
    # NOTE(review): OpenCV morphology acts on bright pixels, and the background
    # is expected to be white here. MORPH_OPEN therefore removes small *white*
    # specks (e.g. pinholes inside dark strokes) and MORPH_CLOSE removes small
    # *dark* specks, which can also erase thin dark strokes. The original
    # comments described the opposite polarity; behavior is kept unchanged,
    # confirm the intent before retuning.
    print("[INFO] 步骤7: 形态学操作（去噪点、填充空洞、清理文字内部）...")

    small_kernel = np.ones((2, 2), np.uint8)
    close_kernel = np.ones((3, 3), np.uint8)

    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, small_kernel, iterations=1)
    closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, close_kernel, iterations=2)
    result = cv2.morphologyEx(closed, cv2.MORPH_OPEN, small_kernel, iterations=1)

    # Final clean-up: drop dark blobs far smaller than the median blob.
    result = _remove_small_dark_components(result)

    # Encode in memory and write the bytes ourselves, again so that paths with
    # non-ASCII characters work.
    success, encoded_img = cv2.imencode('.png', result)
    if not success:
        raise ValueError(f"保存图片失败: {output_path}")

    encoded_img.tofile(str(output_path))
    print(f"[OK] 预处理完成，已保存: {Path(output_path).name}")

    return result


if __name__ == '__main__':
    # Command-line entry point: expects an input image path followed by an
    # output image path; prints usage and exits with status 1 otherwise.
    if len(sys.argv) <= 2:
        print("用法: python preprocess_image.py <输入图片路径> <输出图片路径>")
        sys.exit(1)

    input_path, output_path = sys.argv[1], sys.argv[2]

    try:
        preprocess_image_for_ocr(input_path, output_path)
    except Exception as exc:
        # Top-level boundary: report the failure, dump the traceback for
        # debugging, and signal the error through the exit status.
        print(f"[ERROR] 预处理失败: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)