import re
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
import openpyxl
# Extract the text content of the PDF.
with open('发票.pdf', 'rb') as file:
    text = extract_text(file, laparams=LAParams(line_margin=1))

# Define the regular expression: capture whatever follows "编号:" up to the
# end of that line ('.' does not cross newlines, so the lazy group stops at
# the first line break or at the end of the text).
# NOTE(review): the pattern only matches the ASCII colon ":"; if the extracted
# text uses a full-width colon instead, nothing will match -- confirm against
# real input before relying on the output.
regex = r"编号:(.*?)(?:\n|$)"
xinyongcodes = re.findall(regex, text)

# Create a new Excel workbook and use its active worksheet.
workbook = openpyxl.Workbook()
sheet = workbook.active

# Write the extracted codes into column 1, one code per row.
row_num = 1
for code in xinyongcodes:
    # Skip matches that are empty or whitespace-only.
    if code.strip():
        sheet.cell(row=row_num, column=1, value=code.strip())
        # Advance only after a write so skipped matches leave no blank rows.
        row_num += 1

# Save the Excel file.
workbook.save(filename='luomiqi1.xlsx')