编辑
2025-03-17
技术
00
请注意,本文编写于 335 天前,最后修改于 335 天前,其中某些信息可能已经过时。

目录

import pdfplumber
import pandas as pd
腾讯云API配置
创建腾讯云客户端

代码如下:

python
"""Extract shipping-container numbers from scanned PDFs.

Every PDF in the current working directory is rasterised page by page, each
page is sent to the Tencent Cloud "RecognizeTableAccurateOCR" API, and the
recognised cell text is searched for container numbers (four letters followed
by seven digits). Numbers whose four-character prefix contains a character
that OCR commonly confuses with a letter are reported separately together
with a corrected candidate. The results are written to an Excel workbook.
"""
import base64
import io
import json
import os
import re
import time

from openpyxl import Workbook
from pdf2image import convert_from_path
from PIL import Image
from tencentcloud.common import credential
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.ocr.v20181119 import models, ocr_client

# Tencent Cloud API credentials. Read from the environment so that secrets do
# not have to be committed to the source file; both default to an empty string.
SECRET_ID = os.environ.get('TENCENTCLOUD_SECRET_ID', '')
SECRET_KEY = os.environ.get('TENCENTCLOUD_SECRET_KEY', '')

# Well-formed container number: four upper-case letters followed by seven digits.
_STANDARD_NUMBER_RE = re.compile(r'\b[A-Z]{4}\d{7}\b')
# Same shape, but the prefix may also contain characters OCR tends to
# substitute for a letter: 0 (for O), 1 and lower-case l (for I), 2 (for Z).
_SUSPECT_NUMBER_RE = re.compile(r'\b[A-Z012l]{4}\d{7}\b')
# Confusable prefix character -> the letter it most likely stands for.
_OCR_PREFIX_FIXES = str.maketrans({'0': 'O', '1': 'I', 'l': 'I', '2': 'Z'})


def create_ocr_client():
    """Build a Tencent Cloud OCR client for the ap-guangzhou region.

    Returns:
        An ``OcrClient`` instance, or ``None`` if construction failed (the
        error is printed).
    """
    try:
        cred = credential.Credential(SECRET_ID, SECRET_KEY)
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"
        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile
        return ocr_client.OcrClient(cred, "ap-guangzhou", client_profile)
    except Exception as e:
        print(f"创建腾讯云客户端失败: {str(e)}")
        return None


def extract_text_from_image(image):
    """Run Tencent Cloud table OCR on a PIL image and return the cell text.

    Args:
        image: A PIL image (one rendered PDF page).

    Returns:
        The non-empty cell texts joined with newlines, or ``''`` when the
        client cannot be created or the OCR call fails (the error is printed).
    """
    # The API takes the image as a base64 string, so serialise to PNG first.
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

    client = create_ocr_client()
    if not client:
        return ''

    try:
        req = models.RecognizeTableAccurateOCRRequest()
        req.ImageBase64 = img_base64
        response = client.RecognizeTableAccurateOCR(req)

        # Either list may be absent on the response object when nothing was
        # detected, so fall back to an empty list instead of iterating None.
        table_text = [
            cell.Text
            for table in (response.TableDetections or [])
            for cell in (table.Cells or [])
            if cell.Text
        ]
        text = '\n'.join(table_text)
        print('\n表格识别结果:')
        print('-' * 50)
        print(text)
        print('-' * 50)
        return text
    except TencentCloudSDKException as err:
        print(f'OCR API调用出错: {err}')
        return ''
    except Exception as e:
        print(f'OCR处理出错: {str(e)}')
        return ''


def extract_container_numbers(pdf_path):
    """Extract container numbers, and likely OCR-damaged ones, from a PDF.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(container_numbers, potential_numbers)``:

        * ``container_numbers`` - unique well-formed numbers in order of first
          appearance.
        * ``potential_numbers`` - unique ``(raw_number, corrections)`` tuples
          for numbers whose prefix contains ``0``, ``1``, ``l`` or ``2``;
          ``corrections`` is a list holding the candidate with every such
          character replaced by the letter it resembles.
    """
    container_numbers = []
    potential_numbers = []
    seen_standard = set()
    seen_suspect = set()

    # Each PDF page is rendered to an image and OCR'd independently.
    for image in convert_from_path(pdf_path):
        text = extract_text_from_image(image)

        print("\nOCR识别的文本内容:")
        print(text)
        print("\n开始匹配集装箱号...")

        for match in _STANDARD_NUMBER_RE.finditer(text):
            container_number = match.group()
            print(f"找到匹配: {container_number}")
            if container_number in seen_standard:
                print(f"重复项,已跳过: {container_number}")
                continue
            seen_standard.add(container_number)
            container_numbers.append(container_number)
            print(f"已添加到结果列表: {container_number}")

        for match in _SUSPECT_NUMBER_RE.finditer(text):
            number = match.group()
            # Only the four-character prefix is corrected; the seven trailing
            # characters are genuine digits.
            prefix, digits = number[:4], number[4:]
            corrected_prefix = prefix.translate(_OCR_PREFIX_FIXES)
            # An unchanged prefix means there was no confusable character, i.e.
            # this is an ordinary number already handled by the loop above.
            if corrected_prefix == prefix or number in seen_suspect:
                continue
            seen_suspect.add(number)
            potential_numbers.append((number, [corrected_prefix + digits]))

    return container_numbers, potential_numbers


def save_to_excel(container_numbers, potential_numbers, output_file):
    """Write both result lists to separate sheets of an Excel workbook.

    Args:
        container_numbers: Iterable of well-formed container numbers.
        potential_numbers: Iterable of ``(raw_number, corrections)`` tuples.
        output_file: Destination path of the ``.xlsx`` file.
    """
    wb = Workbook()

    # Sheet 1: well-formed numbers.
    ws1 = wb.active
    ws1.title = '标准箱号'
    ws1.append(['集装箱号'])
    for number in container_numbers:
        ws1.append([number])

    # Sheet 2: suspect numbers alongside their corrected candidates.
    ws2 = wb.create_sheet('可能存在OCR错误的箱号')
    ws2.append(['集装箱号', '可能的正确形式'])
    for number, corrections in potential_numbers:
        ws2.append([number, ', '.join(corrections)])

    wb.save(output_file)
    print(f'成功导出{len(container_numbers)}个标准箱号和{len(potential_numbers)}个可能存在OCR错误的箱号到{output_file}')


def process_pdf_directory(directory):
    """Process every ``.pdf`` file directly inside *directory*.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        A tuple ``(all_container_numbers, all_potential_numbers)`` holding the
        per-file results concatenated in file-name order.
    """
    all_container_numbers = []
    all_potential_numbers = []

    # os.listdir() order is arbitrary; sort so the output is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, filename)
        print(f'正在处理: {filename}')

        container_numbers, potential_numbers = extract_container_numbers(pdf_path)
        all_container_numbers.extend(container_numbers)
        all_potential_numbers.extend(potential_numbers)

    return all_container_numbers, all_potential_numbers


def main():
    """Scan the current working directory and export the results to Excel."""
    current_dir = os.getcwd()

    print('开始提取集装箱号...')
    container_numbers, potential_numbers = process_pdf_directory(current_dir)

    if container_numbers or potential_numbers:
        output_file = 'container_numbers.xlsx'
        save_to_excel(container_numbers, potential_numbers, output_file)
    else:
        print('未找到任何集装箱号')


if __name__ == '__main__':
    main()

本文作者:ivan

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!