# SearchCompany/parse_img/process_manchu_dict.py

import os
import cv2
from paddleocr import PaddleOCR
import numpy as np

def imread_chinese(path):
    """Read an image from a path that may contain Chinese characters.

    The file's raw bytes are loaded with ``np.fromfile`` and then decoded in
    memory with ``cv2.imdecode`` using ``cv2.IMREAD_COLOR``.

    Args:
        path: Location of the image file.

    Returns:
        The value returned by ``cv2.imdecode`` for the file's bytes, or
        ``None`` if reading or decoding raised an exception (a message is
        printed in that case).
    """
    try:
        # Load the raw bytes first, then decode them as an image in memory.
        raw_bytes = np.fromfile(path, dtype=np.uint8)
        decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    except Exception as e:
        print(f"读取图像失败 {path}: {e}")
        return None
    return decoded

def split_image_vertically(img_path, split_num=3):
    """Split an image into ``split_num`` bands stacked from top to bottom.

    The image is cut along its height (first array axis); every band keeps
    the full width. Each band is ``height // split_num`` rows tall, except
    the last one, which also takes the leftover ``height % split_num`` rows
    so that no rows at the bottom of the image are discarded.

    Args:
        img_path: Path of the image file to load and split.
        split_num: Number of bands to produce (3 by default: top, middle,
            bottom).

    Returns:
        A list of ``split_num`` array slices of the loaded image ordered
        from top to bottom, or ``None`` if the image could not be read.
    """
    # imread_chinese is used for loading so that paths containing Chinese
    # characters can be read.
    img = imread_chinese(img_path)

    if img is None:
        print(f"无法读取图像: {img_path}")
        return None

    height = img.shape[0]
    section_height = height // split_num
    sections = []
    for i in range(split_num):
        top = i * section_height
        # The final band extends to the bottom edge so the remainder rows
        # left over by the integer division are kept.
        bottom = height if i == split_num - 1 else top + section_height
        sections.append(img[top:bottom, :])
    return sections


def detect_vertical_text(ocr, img_section):
    """Recognize vertically oriented English text in an image region.

    The region is rotated 90 degrees clockwise so that vertical text lines
    become horizontal, then passed to ``ocr.predict``.

    Args:
        ocr: OCR engine exposing ``predict(image, use_textline_orientation=...)``
            (the ``PaddleOCR`` instance created by the caller).
        img_section: Image array holding the region to recognize.

    Returns:
        A list of recognized text strings taken from the first result
        returned by ``ocr.predict``; an empty list when the region is
        missing or empty, or when nothing was returned.
    """
    # Nothing to recognize: avoid handing a missing or empty image to OpenCV.
    if img_section is None or img_section.size == 0:
        return []

    # Rotate 90 degrees clockwise so vertical text lines become horizontal.
    rotated = cv2.rotate(img_section, cv2.ROTATE_90_CLOCKWISE)
    result = ocr.predict(rotated, use_textline_orientation=True)
    if not result:
        return []

    page = result[0]
    if page is None:
        return []

    # PaddleOCR 3.x ``predict`` yields dict-like results whose recognized
    # strings are stored under "rec_texts". Iterating such a result would
    # walk its keys, so it must not be parsed like the legacy list layout.
    if hasattr(page, "get"):
        return list(page.get("rec_texts") or [])

    # Legacy layout: a list of [box, (text, score)] entries.
    return [line[1][0] for line in page]


def process_images(image_dir, start_num=1, end_num=1097):
    """Run OCR over a numbered sequence of page images, one page at a time.

    Images are expected to be named ``<number>.png`` inside ``image_dir``.
    Numbers whose file does not exist, or whose image cannot be read, are
    skipped so that one bad page does not abort the whole batch.

    Args:
        image_dir: Directory containing the page images.
        start_num: First page number to process (inclusive).
        end_num: Last page number to process (inclusive).

    Yields:
        One dict per processed page with the keys:
            ``"page_number"``: the number taken from the file name;
            ``"sections"``: a list of ``{"section", "english_text"}`` dicts
            for the top, middle and bottom bands of the page;
            ``"detected_page"``: text recognized in the first 50 rows of the
            top band, where the printed page number is assumed to be.
    """
    ocr = PaddleOCR(
        lang='en',
        use_textline_orientation=True,
        text_det_unclip_ratio=2.0,  # expansion ratio for detected text boxes
    )
    section_names = ("top", "middle", "bottom")

    for i in range(start_num, end_num + 1):
        img_path = os.path.join(image_dir, f"{i}.png")
        if not os.path.exists(img_path):
            continue

        sections = split_image_vertically(img_path)
        if sections is None:
            # The image could not be read; skip this page instead of
            # failing on enumerate(None).
            continue

        page_results = {
            "page_number": i,
            "sections": []
        }

        for idx, section in enumerate(sections):
            # Recognize the page number (assumed to sit at the top of the
            # first band).
            if idx == 0:
                page_results["detected_page"] = detect_vertical_text(ocr, section[:50, :])

            # Recognize the English content of each band.
            eng_text = detect_vertical_text(ocr, section)
            page_results["sections"].append({
                "section": section_names[idx],
                "english_text": eng_text
            })

        yield page_results


if __name__ == "__main__":
    # Script entry point: OCR every page image under IMAGE_DIR and print a
    # short report for each processed page.
    IMAGE_DIR = r"D:/gitstudy/pythonwork/manchuspider/data/满洲语字典/满汉大辞典/images"
    separator = "-" * 50
    for result in process_images(IMAGE_DIR):
        print(f"Page {result['page_number']}:")
        # Fall back to 'N/A' when no page-number entry was recorded.
        detected = result.get('detected_page', 'N/A')
        print(f"Detected Page No: {detected}")
        for section in result["sections"]:
            joined = ', '.join(section['english_text'])
            print(f"{section['section']} section English: {joined}")
        print(separator)