问题遇到的现象和发生背景
想用 Python 实现智能化办公,自动提取发票信息,在网上查到了一个很好的案例,地址如下:
https://zhuanlan.zhihu.com/p/368712419
但是在实现的过程中,有两份发票 PDF 转成 JPEG 图片后整个背景变黑了,用 OCR 识别不出上面的文字,应该怎么办?
运行结果及报错内容
原本的PDF
转成JPEG后的图片
有没有人可以指导一下
代码附上
import io
import os
import shutil
from glob import glob

import openpyxl as op
import pyocr.builders
from PIL import Image as PI
from wand.color import Color
from wand.image import Image
# Collect every PDF invoice to process.
paths = glob(r'C:\Users\Administrator\Desktop\test*.pdf')

# Use the first OCR backend pyocr can find, and fail with a readable message
# (instead of a bare IndexError) when none is installed.
available_tools = pyocr.get_available_tools()
if not available_tools:
    raise RuntimeError(
        'pyocr found no OCR tool; install Tesseract and add it to PATH')
tool = available_tools[0]

# One result list per invoice field; write() reads them by these names.
txt1_lis = []  # invoice code
txt2_lis = []  # invoice number
txt3_lis = []  # issue date
txt4_lis = []  # check code (last six characters only)
txt5_lis = []  # amount


def _last_six_without_spaces(text):
    """Drop spaces from *text* and keep only its last six characters."""
    return text.replace(" ", "")[-6:]


# Each entry: crop box (left, top, right, bottom) in pixels of the 300-dpi
# render, OCR language (None = backend default), optional clean-up callable,
# and the list that collects the recognised text.
FIELD_REGIONS = (
    ((1958, 50, 2500, 150), None, None, txt1_lis),        # invoice code
    ((1958, 150, 2500, 200), None, None, txt2_lis),       # invoice number
    ((1958, 200, 2500, 250), 'chi_sim', None, txt3_lis),  # issue date
    ((1958, 260, 2500, 320), None,
     _last_six_without_spaces, txt4_lis),                 # check code
    ((2120, 1150, 2300, 1200), None, None, txt5_lis),     # amount
)

for path in paths:
    # Render the PDF at 300 dpi with wand. Only the first page is read, so only
    # that page is turned into a JPEG blob.
    with Image(filename=path, resolution=300) as image_pdf:
        with Image(image=image_pdf.sequence[0]) as img_page:
            # JPEG has no alpha channel, so a page rendered with a transparent
            # background comes out black. Flatten it onto white first so the
            # text stays readable for OCR.
            img_page.background_color = Color('white')
            img_page.alpha_channel = 'remove'
            page_blob = img_page.make_blob('jpeg')

    # Re-open the JPEG bytes as a PIL image, which is what pyocr expects.
    new_img = PI.open(io.BytesIO(page_blob))

    for box, lang, clean_up, results in FIELD_REGIONS:
        region = new_img.crop(box)
        if lang is None:
            text = tool.image_to_string(region)
        else:
            text = tool.image_to_string(region, lang=lang)
        if clean_up is not None:
            text = clean_up(text)
        results.append(text)
file = r"C:\Users\Administrator\Desktop\模版(2021版).xlsx"


def write():
    """Copy the collected OCR results into the template workbook and save it.

    The workbook at ``file`` is opened, and for the k-th processed invoice
    (k starting at 1) row ``k + 1`` of the "机打发票" sheet receives the values
    of ``txt1_lis`` .. ``txt5_lis`` in columns 3, 4, 5, 6 and 11 respectively.
    Row 1 is never written (presumably a header row; confirm against the
    template). The workbook is then saved back to the same path.
    """
    workbook = op.load_workbook(file)
    sheet = workbook["机打发票"]

    # Target column -> list holding that column's values, in write order.
    column_sources = (
        (3, txt1_lis),
        (4, txt2_lis),
        (5, txt3_lis),
        (6, txt4_lis),
        (11, txt5_lis),
    )

    for offset in range(len(txt1_lis)):
        # Results start on the second row of the sheet.
        row = offset + 2
        for column, values in column_sources:
            sheet.cell(row, column, values[offset])

    workbook.save(file)


write()