代码如下:
# import pdfplumber
# import pandas as pd
import re
import os
from pdf2image import convert_from_path
from PIL import Image
from openpyxl import Workbook
import time
import io
import json
import base64
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
# Tencent Cloud API credentials.
# Read from the environment so real keys never have to be written into this
# file; both fall back to an empty string when the variable is not set.
SECRET_ID = os.environ.get('TENCENTCLOUD_SECRET_ID', '')
SECRET_KEY = os.environ.get('TENCENTCLOUD_SECRET_KEY', '')
def create_ocr_client():
    """Build a Tencent Cloud OCR client bound to the ``ap-guangzhou`` region.

    The client is configured with the module-level ``SECRET_ID`` /
    ``SECRET_KEY`` and the ``ocr.tencentcloudapi.com`` endpoint.

    Returns:
        An ``ocr_client.OcrClient`` instance, or ``None`` when construction
        raised; in that case the error is printed rather than propagated.
    """
    try:
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"

        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile

        credentials = credential.Credential(SECRET_ID, SECRET_KEY)
        return ocr_client.OcrClient(credentials, "ap-guangzhou", client_profile)
    except Exception as e:
        print(f"创建腾讯云客户端失败: {str(e)}")
        return None
def extract_text_from_image(image):
    """Run Tencent Cloud table recognition on an image and return its cell text.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method; it is
            serialized to PNG in memory before being sent.

    Returns:
        The non-empty cell texts of every detected table joined with newlines,
        or ``''`` when no client could be created or the request failed
        (failures are printed, not raised).
    """
    # Encode the image as PNG in memory and base64 it for the request payload.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format='PNG')
    encoded_image = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

    client = create_ocr_client()
    if not client:
        return ''

    try:
        request = models.RecognizeTableAccurateOCRRequest()
        request.ImageBase64 = encoded_image
        response = client.RecognizeTableAccurateOCR(request)

        # Flatten every table's cells, skipping cells without text.
        cell_texts = [
            cell.Text
            for table in response.TableDetections
            for cell in table.Cells
            if cell.Text
        ]
        text = '\n'.join(cell_texts)

        divider = '-' * 50
        print('\n表格识别结果:')
        print(divider)
        print(text)
        print(divider)
        return text
    except TencentCloudSDKException as err:
        print(f'OCR API调用出错: {err}')
        return ''
    except Exception as e:
        print(f'OCR处理出错: {str(e)}')
        return ''
# Standard form: four uppercase letters followed by seven digits.
_STANDARD_NUMBER_RE = re.compile(r'\b[A-Z]{4}\d{7}\b')
# Same shape, but the four-character prefix may also contain characters that
# OCR commonly produces in place of a letter ('0', '1', 'l', '2').
_POTENTIAL_NUMBER_RE = re.compile(r'\b[A-Z0O1Il2Z]{4}\d{7}\b')
# OCR look-alikes in the letter prefix and the letter each one stands in for.
_OCR_LETTER_FIXES = str.maketrans({'0': 'O', '1': 'I', 'l': 'I', '2': 'Z'})


def _scan_page_text(text, container_numbers, potential_numbers, seen_potential):
    """Append the standard and suspect container numbers found in ``text``."""
    print("\n开始匹配集装箱号...")
    for match in _STANDARD_NUMBER_RE.finditer(text):
        container_number = match.group()
        print(f"找到匹配: {container_number}")
        if container_number not in container_numbers:
            container_numbers.append(container_number)
            print(f"已添加到结果列表: {container_number}")
        else:
            print(f"重复项,已跳过: {container_number}")

    for match in _POTENTIAL_NUMBER_RE.finditer(text):
        number = match.group()
        letters, digits = number[:4], number[4:]
        # Only the letter prefix is corrected; the seven digits are left alone.
        corrected_letters = letters.translate(_OCR_LETTER_FIXES)
        # An unchanged prefix is all uppercase letters, i.e. a standard number
        # that the first pass has already handled.
        if corrected_letters == letters or number in seen_potential:
            continue
        seen_potential.add(number)
        potential_numbers.append((number, [corrected_letters + digits]))


def extract_container_numbers(pdf_path):
    """Extract container numbers from a PDF via OCR.

    Every page is rendered to an image, sent through
    ``extract_text_from_image``, and the resulting text is scanned twice: once
    for well-formed numbers (four uppercase letters + seven digits) and once
    for numbers whose letter prefix contains an OCR look-alike character
    (``0``, ``1``, ``l`` or ``2``).

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(container_numbers, potential_numbers)``:
        ``container_numbers`` is a list of unique well-formed numbers in
        first-seen order; ``potential_numbers`` is a list of unique
        ``(raw_number, corrections)`` tuples, where ``corrections`` is a list
        holding the raw number with every look-alike in its prefix replaced by
        the corresponding letter.
    """
    container_numbers = []
    potential_numbers = []
    # Raw suspect numbers already reported, so each appears only once.
    seen_potential = set()

    for image in convert_from_path(pdf_path):
        text = extract_text_from_image(image)
        print("\nOCR识别的文本内容:")
        print(text)
        _scan_page_text(text, container_numbers, potential_numbers, seen_potential)

    return container_numbers, potential_numbers
def save_to_excel(container_numbers, potential_numbers, output_file):
    """Write both result lists to an Excel workbook, one worksheet each.

    Args:
        container_numbers: Iterable of well-formed container numbers.
        potential_numbers: Iterable of ``(raw_number, corrections)`` pairs,
            where ``corrections`` is an iterable of strings.
        output_file: Destination path passed to ``Workbook.save``.
    """
    workbook = Workbook()

    # First sheet: one well-formed number per row under a header.
    standard_sheet = workbook.active
    standard_sheet.title = '标准箱号'
    standard_sheet.append(['集装箱号'])
    for container_number in container_numbers:
        standard_sheet.append([container_number])

    # Second sheet: raw suspect number plus its comma-separated candidates.
    suspect_sheet = workbook.create_sheet('可能存在OCR错误的箱号')
    suspect_sheet.append(['集装箱号', '可能的正确形式'])
    for raw_number, candidates in potential_numbers:
        suspect_sheet.append([raw_number, ', '.join(candidates)])

    workbook.save(output_file)
    print(f'成功导出{len(container_numbers)}个标准箱号和{len(potential_numbers)}个可能存在OCR错误的箱号到{output_file}')
def process_pdf_directory(directory):
    """Extract container numbers from every PDF directly inside ``directory``.

    Files are matched by a case-insensitive ``.pdf`` suffix and processed in
    sorted name order so the output order is reproducible. Numbers that occur
    in more than one PDF are reported only once.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A tuple ``(all_container_numbers, all_potential_numbers)`` with the
        same element shapes as ``extract_container_numbers`` returns; both
        lists are empty when the directory holds no PDF files.
    """
    all_container_numbers = []
    all_potential_numbers = []
    seen_standard = set()
    seen_potential = set()

    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, filename)
        print(f'正在处理: {filename}')
        container_numbers, potential_numbers = extract_container_numbers(pdf_path)

        # Merge per-file results, dropping numbers already seen in earlier files.
        for number in container_numbers:
            if number not in seen_standard:
                seen_standard.add(number)
                all_container_numbers.append(number)
        for number, corrections in potential_numbers:
            if number not in seen_potential:
                seen_potential.add(number)
                all_potential_numbers.append((number, corrections))

    return all_container_numbers, all_potential_numbers
def main():
    """Scan the current working directory for PDFs and export the results.

    Writes ``container_numbers.xlsx`` when at least one standard or suspect
    number was found; otherwise only prints a notice.
    """
    working_dir = os.getcwd()
    print('开始提取集装箱号...')
    container_numbers, potential_numbers = process_pdf_directory(working_dir)

    # Nothing found in any PDF: report it and skip creating the workbook.
    if not (container_numbers or potential_numbers):
        print('未找到任何集装箱号')
        return

    save_to_excel(container_numbers, potential_numbers, 'container_numbers.xlsx')


if __name__ == '__main__':
    main()
本文作者:ivan
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!