从扫描版PDF表格中提取不重复的集装箱号

代码如下：
python
# import pdfplumber
# import pandas as pd
import re
import os
from pdf2image import convert_from_path
from PIL import Image
from openpyxl import Workbook
import time
import io
import json
import base64
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models

# Tencent Cloud API credentials. Both are passed to credential.Credential()
# in create_ocr_client(). They are empty strings here and must be filled in
# before the OCR calls can succeed; avoid committing real keys.
SECRET_ID = ''
SECRET_KEY = ''

def create_ocr_client():
    """Build a Tencent Cloud OCR client from the module-level credentials.

    Returns:
        An ``ocr_client.OcrClient`` bound to the ``ap-guangzhou`` region and
        the ``ocr.tencentcloudapi.com`` endpoint, or ``None`` if anything
        raised while constructing it (the error is printed, not propagated).
    """
    try:
        credentials = credential.Credential(SECRET_ID, SECRET_KEY)

        http_settings = HttpProfile()
        http_settings.endpoint = "ocr.tencentcloudapi.com"

        client_settings = ClientProfile()
        client_settings.httpProfile = http_settings

        client = ocr_client.OcrClient(credentials, "ap-guangzhou", client_settings)
    except Exception as exc:
        # Best-effort: callers treat a missing client as "no OCR result".
        print(f"创建腾讯云客户端失败: {exc!s}")
        return None
    return client

def extract_text_from_image(image):
    """Run Tencent Cloud accurate table OCR on a single image.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The non-empty cell texts of every detected table, joined by newlines.
        Returns ``''`` when no client could be created or the OCR call failed
        (the error is printed, not propagated).
    """
    # The request carries the image as base64 text, so encode it as PNG in memory.
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    encoded_image = base64.b64encode(buffer.getvalue()).decode('utf-8')

    client = create_ocr_client()
    if not client:
        return ''

    try:
        request = models.RecognizeTableAccurateOCRRequest()
        request.ImageBase64 = encoded_image

        result = client.RecognizeTableAccurateOCR(request)

        # Flatten all detected tables into one list, skipping empty cells.
        cell_texts = [
            cell.Text
            for table in result.TableDetections
            for cell in table.Cells
            if cell.Text
        ]
        text = '\n'.join(cell_texts)

        separator = '-' * 50
        print('\n表格识别结果:')
        print(separator)
        print(text)
        print(separator)
        return text
    except TencentCloudSDKException as err:
        print(f'OCR API调用出错: {err}')
        return ''
    except Exception as exc:
        print(f'OCR处理出错: {exc!s}')
        return ''

# Standard shape: four uppercase letters followed by seven digits.
_STANDARD_NUMBER_RE = re.compile(r'\b[A-Z]{4}\d{7}\b')
# Same shape, but the four-character prefix may also contain the look-alike
# characters 0, 1, 2 and lowercase l.
_SUSPECT_NUMBER_RE = re.compile(r'\b[A-Z012l]{4}\d{7}\b')
# Look-alike character -> the letter it is assumed to stand for in the prefix.
_PREFIX_FIXES = str.maketrans({'0': 'O', '1': 'I', 'l': 'I', '2': 'Z'})


def extract_container_numbers(pdf_path):
    """Extract container numbers from a scanned PDF via OCR.

    The PDF is converted to images with ``convert_from_path``; each image is
    passed to ``extract_text_from_image`` and the returned text is searched
    for two kinds of tokens:

    * standard numbers: four uppercase letters followed by seven digits;
    * suspect numbers: the same shape, but with ``0``, ``1``, ``2`` or ``l``
      somewhere in the four-character prefix, which is treated as an OCR
      misreading of ``O``, ``I``, ``Z`` or ``I`` respectively.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(container_numbers, potential_numbers)``.
        ``container_numbers`` is a list of unique standard numbers in order of
        first appearance. ``potential_numbers`` is a list of unique
        ``(raw_number, [corrected_number])`` tuples in order of first
        appearance, where ``corrected_number`` has every look-alike character
        in the prefix replaced at once.
    """
    container_numbers = []
    potential_numbers = []
    # Sets mirror the lists so membership tests stay O(1) while the lists keep
    # first-seen order.
    seen_standard = set()
    seen_suspect = set()

    images = convert_from_path(pdf_path)

    for image in images:
        text = extract_text_from_image(image)

        print("\nOCR识别的文本内容:")
        print(text)
        print("\n开始匹配集装箱号...")

        for match in _STANDARD_NUMBER_RE.finditer(text):
            container_number = match.group()
            print(f"找到匹配: {container_number}")
            if container_number in seen_standard:
                print(f"重复项，已跳过: {container_number}")
                continue
            seen_standard.add(container_number)
            container_numbers.append(container_number)
            print(f"已添加到结果列表: {container_number}")

        for match in _SUSPECT_NUMBER_RE.finditer(text):
            number = match.group()
            prefix, serial = number[:4], number[4:]
            corrected_prefix = prefix.translate(_PREFIX_FIXES)

            # An unchanged prefix is all uppercase letters, i.e. a standard
            # number that the loop above has already recorded.
            if corrected_prefix == prefix:
                continue
            # Compare against the raw numbers seen so far; the result list
            # holds tuples, so testing the string against it never matches.
            if number in seen_suspect:
                continue

            seen_suspect.add(number)
            # Only the prefix is corrected; digits in the serial part are valid.
            potential_numbers.append((number, [corrected_prefix + serial]))

    return container_numbers, potential_numbers

def save_to_excel(container_numbers, potential_numbers, output_file):
    """Write both result lists to one Excel workbook, one worksheet each.

    Args:
        container_numbers: List of standard container number strings.
        potential_numbers: List of ``(number, corrections)`` pairs, where
            ``corrections`` is an iterable of strings joined with ``', '``.
        output_file: Destination handed to ``Workbook.save``.
    """
    def write_rows(sheet, header, rows):
        # Header first, then one worksheet row per data row.
        sheet.append(header)
        for row in rows:
            sheet.append(row)

    workbook = Workbook()

    # The workbook starts with one active sheet; reuse it for standard numbers.
    standard_sheet = workbook.active
    standard_sheet.title = '标准箱号'
    write_rows(
        standard_sheet,
        ['集装箱号'],
        ([value] for value in container_numbers),
    )

    suspect_sheet = workbook.create_sheet('可能存在OCR错误的箱号')
    write_rows(
        suspect_sheet,
        ['集装箱号', '可能的正确形式'],
        ([raw, ', '.join(candidates)] for raw, candidates in potential_numbers),
    )

    workbook.save(output_file)

    standard_count = len(container_numbers)
    suspect_count = len(potential_numbers)
    print(f'成功导出{standard_count}个标准箱号和{suspect_count}个可能存在OCR错误的箱号到{output_file}')


def process_pdf_directory(directory):
    """Collect container numbers from every PDF file directly in ``directory``.

    Entries are selected by a case-insensitive ``.pdf`` suffix and must be
    regular files. They are processed in sorted name order so the output order
    is reproducible. Results of all files are merged, and a number found in
    several PDFs (or several times in one) is reported once, at its first
    occurrence.

    Args:
        directory: Path of the directory to scan (sub-directories are not
            entered).

    Returns:
        A tuple ``(all_container_numbers, all_potential_numbers)`` whose
        elements have the shapes returned by ``extract_container_numbers``:
        a list of strings and a list of ``(number, corrections)`` tuples.
    """
    all_container_numbers = []
    all_potential_numbers = []
    # Sets track what was already emitted; the lists keep first-seen order.
    seen_standard = set()
    seen_potential = set()

    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, filename)
        # A directory that merely ends in ".pdf" is not a document.
        if not os.path.isfile(pdf_path):
            continue
        print(f'正在处理: {filename}')

        container_numbers, potential_numbers = extract_container_numbers(pdf_path)

        for number in container_numbers:
            if number not in seen_standard:
                seen_standard.add(number)
                all_container_numbers.append(number)

        for entry in potential_numbers:
            # The raw OCR string (first tuple element) identifies the entry.
            raw_number = entry[0]
            if raw_number not in seen_potential:
                seen_potential.add(raw_number)
                all_potential_numbers.append(entry)

    return all_container_numbers, all_potential_numbers

def main():
    """Scan the current working directory and export the results to Excel.

    Writes ``container_numbers.xlsx`` when at least one standard or suspect
    number was found; otherwise only prints a notice.
    """
    working_dir = os.getcwd()

    print('开始提取集装箱号...')
    standard, suspects = process_pdf_directory(working_dir)

    # Nothing found: report it and skip creating an empty workbook.
    if not (standard or suspects):
        print('未找到任何集装箱号')
        return

    save_to_excel(standard, suspects, 'container_numbers.xlsx')


if __name__ == '__main__':
    main()