Python转换发票PDF文件为JPEG，但是里面有几个邮票PDF转换的时候图片是黑的

问题遇到的现象和发生背景

想用 Python 做智能化办公、自动提取发票信息，在网上查到了一个很好的案例，地址如下：
https://zhuanlan.zhihu.com/p/368712419
但是在实现的过程中，有两份邮票的PDF在转换成JPEG之后整个背景变黑了，用OCR识别不出上面的文字，应该怎么办？

运行结果及报错内容

原本的PDF

转成JPEG后的图片

有没有人可以指导一下

代码附上

import io
import os
import shutil
from glob import glob

import openpyxl as op
import pyocr.builders
from PIL import Image as PI
from wand.color import Color
from wand.image import Image

# Collect every PDF invoice to process.
# NOTE(review): this pattern matches Desktop files whose names start with
# "test". If the PDFs actually live inside a "test" folder, a backslash was
# probably lost when the code was pasted -- confirm against the real layout.
paths = glob(r'C:\Users\Administrator\Desktop\test*.pdf')

# Use the first OCR backend (e.g. tesseract) that pyocr can find.
available_tools = pyocr.get_available_tools()
if not available_tools:
    # Fail with an actionable message instead of a bare IndexError.
    raise RuntimeError(
        'pyocr found no OCR tool; install tesseract and make sure it is on PATH.'
    )
tool = available_tools[0]

# One result list per extracted field, filled in the same order as `paths`.
txt1_lis = []  # invoice code
txt2_lis = []  # invoice number
txt3_lis = []  # issue date
txt4_lis = []  # check code (last six characters only)
txt5_lis = []  # amount


def _render_first_page(pdf_path):
    """Render the first page of *pdf_path* at 300 dpi and return a PIL image.

    JPEG has no alpha channel, so a PDF page with a transparent background
    would otherwise be encoded with black where the page is transparent,
    which makes the text unreadable for OCR. The page is therefore flattened
    onto a white background before it is encoded.
    """
    with Image(filename=pdf_path, resolution=300) as pdf:
        # Only the first page is ever cropped, so the others are not rendered.
        with Image(image=pdf.sequence[0]) as page:
            page.background_color = Color('white')
            page.alpha_channel = 'remove'
            jpeg_bytes = page.make_blob('jpeg')
    return PI.open(io.BytesIO(jpeg_bytes))


def _last_six_without_spaces(text):
    """Drop spaces from *text* and keep only its last six characters."""
    return text.replace(" ", "")[-6:]


# (result list, crop box as (left, top, right, bottom) in pixels of the
# 300 dpi rendering, OCR language or None for the tool default, optional
# post-processing of the recognised text).
_FIELDS = (
    (txt1_lis, (1958, 50, 2500, 150), None, None),
    (txt2_lis, (1958, 150, 2500, 200), None, None),
    (txt3_lis, (1958, 200, 2500, 250), 'chi_sim', None),
    (txt4_lis, (1958, 260, 2500, 320), None, _last_six_without_spaces),
    (txt5_lis, (2120, 1150, 2300, 1200), None, None),
)

for path in paths:
    new_img = _render_first_page(path)
    for results, box, lang, postprocess in _FIELDS:
        text = tool.image_to_string(new_img.crop(box), lang=lang)
        if postprocess is not None:
            text = postprocess(text)
        results.append(text)

# Excel workbook that receives the extracted invoice fields.
file = r"C:\Users\Administrator\Desktop\模版（2021版）.xlsx"


def write():
    """Write the collected OCR results into the "机打发票" sheet of ``file``.

    Entry ``k`` (0-based) of every result list goes to worksheet row
    ``k + 2``: ``txt1_lis`` to column 3, ``txt2_lis`` to column 4,
    ``txt3_lis`` to column 5, ``txt4_lis`` to column 6 and ``txt5_lis``
    to column 11. The workbook is then saved back to the same path.
    """
    workbook = op.load_workbook(file)
    worksheet = workbook["机打发票"]

    # Target column for each result list, in the order the cells are written.
    column_sources = (
        (3, txt1_lis),
        (4, txt2_lis),
        (5, txt3_lis),
        (6, txt4_lis),
        (11, txt5_lis),
    )
    for index in range(len(txt1_lis)):
        target_row = index + 2
        for column, values in column_sources:
            worksheet.cell(target_row, column, values[index])

    workbook.save(file)


write()

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

3条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
溪风沐雪 2022-05-07 17:00
关注
看起来好复杂，但是看图片我感觉有可能是PDF原本的图片是png格式，背景透明，你看看参考这个思路有没有可能找出问题所在
image_png = image_pdf.convert('png')，这个地方不要用png，用jpg试试

解决
无用 1
评论打赏
分享
举报编辑记录

评论

按下Enter换行，Ctrl+Enter发表内容