我想读取PDF文档中的订单编号与发票号,并用它们重命名文件,但程序总是报错:Failed to extract all necessary information from the PDF file。请问问题出在哪里?该代码应该如何修改?
代码如下:
import os
import re
import pdfplumber
# Digit lookarounds are used instead of ``\b``: in a Python 3 ``str`` pattern,
# CJK characters count as word characters, so ``\b`` finds no boundary between
# a Chinese label and the digits that follow it (e.g. "订单编号1234...") and the
# number is silently skipped.
_ORDER_ID_PATTERN = re.compile(r'(?<!\d)\d{20}(?!\d)')
_INVOICE_NUMBER_PATTERN = re.compile(r'(?<!\d)\d{8}(?!\d)')
_INVOICE_AMOUNT_PATTERN = re.compile(r'(\d+(\.\d+)?)元')


def extract_info_from_pdf(pdf_path):
    """Extract the order id, invoice number and invoice amount from a PDF.

    Every page's extracted text is searched for:

    * an order id: a run of exactly 20 digits,
    * an invoice number: a run of exactly 8 digits,
    * an invoice amount: a number (optionally with decimals) directly
      followed by "元".

    The invoice number is taken from the first page on which one is found;
    the order id and the amount are taken from the last page on which they
    are found.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A 4-tuple ``(order_id, invoice_number, invoice_amount, status)``.
        When all three values were found they are returned as strings and
        ``status`` is ``"Success"``; otherwise the first three items are
        ``None`` and ``status`` is a failure message.
    """
    order_id = None
    invoice_number = None
    invoice_amount = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing was extracted from this page; move on.
                continue

            order_id_match = _ORDER_ID_PATTERN.search(text)
            if order_id_match:
                order_id = order_id_match.group(0)

            # Only the first invoice number is kept: each PDF is assumed to
            # contain a single invoice number.
            if not invoice_number:
                invoice_number_match = _INVOICE_NUMBER_PATTERN.search(text)
                if invoice_number_match:
                    invoice_number = invoice_number_match.group(0)

            invoice_amount_match = _INVOICE_AMOUNT_PATTERN.search(text)
            if invoice_amount_match:
                # Group 1 is the numeric part without the trailing "元".
                invoice_amount = invoice_amount_match.group(1)

    # All three values are required to build the new file name.
    if all([order_id, invoice_number, invoice_amount]):
        return order_id, invoice_number, invoice_amount, "Success"
    return None, None, None, "Failed to extract all necessary information from the PDF file."
def rename_file(old_path, new_name):
    """Rename a file in place, keeping it in its current directory.

    Args:
        old_path: Current path of the file.
        new_name: New file name (not a path); it is joined onto the
            directory part of ``old_path``.

    Returns:
        ``"Success"`` when the rename went through, otherwise a string
        starting with ``"Failed to rename file: "`` followed by the error.
    """
    destination = os.path.join(os.path.dirname(old_path), new_name)
    try:
        os.rename(old_path, destination)
        print(f"Renamed {old_path} to {destination}")
    except Exception as exc:
        # Report the problem to the caller as a status string instead of raising.
        return f"Failed to rename file: {exc}"
    return "Success"
# Path of the PDF file to process.
pdf_path = 'D:\\Desktop\\code\\陈述事实.pdf'

# Pull the order id, invoice number and amount out of the PDF.
order_id, invoice_number, invoice_amount, extract_status = extract_info_from_pdf(pdf_path)

if extract_status != "Success":
    # Extraction failed: show the failure message and leave the file alone.
    print(extract_status)
else:
    # Build the new file name from the extracted values and rename the file.
    new_filename = f"{order_id}-{invoice_number}-{invoice_amount}元.pdf"
    rename_status = rename_file(pdf_path, new_filename)
    print("Overall operation successful." if rename_status == "Success" else rename_status)