Files
SearchCompany/parse_img/process_manchu_dict.py
manchuwork 102dd78c26 aiqicha
2025-09-25 03:19:34 +08:00

86 lines
2.8 KiB
Python

import os
import cv2
from paddleocr import PaddleOCR
import numpy as np
def imread_chinese(path):
    """Read an image from a path that may contain Chinese characters.

    The file's raw bytes are loaded with ``np.fromfile`` and then decoded
    in memory with ``cv2.imdecode`` as a colour image, so the path string
    itself is never handed to OpenCV.

    Args:
        path: Location of the image file.

    Returns:
        Whatever ``cv2.imdecode`` produces for the file's bytes, or ``None``
        if reading or decoding raised an exception (the error is printed).
    """
    try:
        # Pull the whole file into a flat uint8 buffer.
        raw_bytes = np.fromfile(path, dtype=np.uint8)
        # Decode the in-memory buffer as a colour image.
        decoded = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    except Exception as err:
        # Best-effort reader: report the problem and signal failure with None.
        print(f"读取图像失败 {path}: {err}")
        return None
    return decoded
def split_image_vertically(img_path, split_num=3):
    """Split an image into ``split_num`` bands stacked from top to bottom.

    With the default of three this yields the top, middle and bottom
    columns of a page. Every band is ``height // split_num`` rows tall,
    except the last one, which also absorbs the rows left over by the
    integer division so that no part of the image is discarded.

    Args:
        img_path: Path of the image to load (read via ``imread_chinese``).
        split_num: Number of bands to produce; must be at least 1.

    Returns:
        A list of ``split_num`` array slices of the image, ordered top to
        bottom, or ``None`` if the image could not be read.

    Raises:
        ValueError: If ``split_num`` is smaller than 1.
    """
    # Validate before touching the file so a bad argument fails fast and
    # with a clearer error than the ZeroDivisionError it would cause below.
    if split_num < 1:
        raise ValueError(f"split_num must be >= 1, got {split_num}")

    img = imread_chinese(img_path)
    if img is None:
        print(f"无法读取图像: {img_path}")
        return None

    height = img.shape[0]
    section_height = height // split_num
    # Row indices where each band starts; the closing boundary is pinned to
    # the full height so remainder rows land in the final band.
    bounds = [i * section_height for i in range(split_num)] + [height]
    return [img[top:bottom, :] for top, bottom in zip(bounds, bounds[1:])]
def detect_vertical_text(ocr, img_section):
    """Recognise vertically laid-out English text in an image region.

    The region is rotated 90 degrees clockwise so that vertical text lines
    become horizontal, then passed to ``ocr.predict``.

    Args:
        ocr: OCR engine exposing ``predict(image, use_textline_orientation=...)``.
        img_section: Image array for the region to recognise.

    Returns:
        A list of recognised text strings for the region; empty when the
        engine returns nothing.
    """
    # Rotate so vertical text lines become horizontal for the recogniser.
    rotated = cv2.rotate(img_section, cv2.ROTATE_90_CLOCKWISE)
    result = ocr.predict(rotated, use_textline_orientation=True)
    if not result:
        return []

    first = result[0]
    if first is None:
        # Legacy result layout uses None for "no text found on this image".
        return []

    # Dict-like result object (the layout produced by ``predict`` in
    # PaddleOCR 3.x): recognised strings live under "rec_texts". Iterating
    # such an object would only walk its keys, so it must be read by key.
    getter = getattr(first, "get", None)
    if callable(getter):
        return list(getter("rec_texts") or [])

    # Legacy layout: a list of [box, (text, score)] entries.
    return [line[1][0] for line in first]
def process_images(image_dir, start_num=1, end_num=1097):
    """Run OCR over a numbered sequence of page images, one page at a time.

    Pages are expected as ``<image_dir>/<n>.png`` for ``n`` in
    ``start_num..end_num`` (inclusive). Missing files and images that
    cannot be read are skipped.

    Args:
        image_dir: Directory containing the numbered PNG files.
        start_num: First page number to process.
        end_num: Last page number to process (inclusive).

    Yields:
        One dict per processed page with the keys:
          * ``"page_number"`` - the number taken from the file name;
          * ``"detected_page"`` - OCR output for the top 50 rows of the
            first section (assumed to hold the printed page number);
          * ``"sections"`` - a list of ``{"section", "english_text"}``
            dicts for the top, middle and bottom sections.
    """
    section_names = ("top", "middle", "bottom")
    # Built on first use: constructing the engine is only worthwhile once
    # there is at least one readable page to recognise.
    ocr = None

    for i in range(start_num, end_num + 1):
        img_path = os.path.join(image_dir, f"{i}.png")
        if not os.path.exists(img_path):
            continue

        sections = split_image_vertically(img_path)
        if sections is None:
            # Unreadable image: skip the page instead of crashing the whole
            # run when iterating over None.
            continue

        if ocr is None:
            ocr = PaddleOCR(
                lang='en',
                use_textline_orientation=True,
                text_det_unclip_ratio=2.0,  # expansion ratio for detected text boxes
                # rec_char_dict_path='en_dict.txt'  # English-only dictionary
            )

        page_results = {
            "page_number": i,
            "sections": []
        }
        for idx, section in enumerate(sections):
            # Page number: assumed to sit at the very top of the first section.
            if idx == 0:
                page_results["detected_page"] = detect_vertical_text(ocr, section[:50, :])
            # English content of each section.
            eng_text = detect_vertical_text(ocr, section)
            page_results["sections"].append({
                "section": section_names[idx],
                "english_text": eng_text
            })
        yield page_results
if __name__ == "__main__":
    # Script entry point: OCR every page image in IMAGE_DIR and print a
    # short per-page report to stdout.
    IMAGE_DIR = r"D:/gitstudy/pythonwork/manchuspider/data/满洲语字典/满汉大辞典/images"
    divider = "-" * 50
    for page in process_images(IMAGE_DIR):
        page_no = page["page_number"]
        print(f"Page {page_no}:")
        # "detected_page" may be absent, hence the fallback text.
        detected = page.get("detected_page", "N/A")
        print(f"Detected Page No: {detected}")
        for part in page["sections"]:
            label = part["section"]
            joined = ", ".join(part["english_text"])
            print(f"{label} section English: {joined}")
        # Visual separator between pages.
        print(divider)