这段代码是网上抄来的,作用是下载文献后翻译,再将标题、摘要、结论拼接在一起。求解答!
问题:程序在 TranslateFile() 处进入循环后直接退出,没有任何输出。
import os
import shutil
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
from shutil import copy2
import hashlib
import requests
import uuid
import time
import json
import importlib,sys
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfpage import PDFPage
import re
import time
class YouDaoFanyi:
    """Small client for the Youdao text-translation HTTP API (``signType`` v3).

    The request signature is ``sha256(appKey + truncate(q) + salt + curtime +
    appSecret)``, exactly as assembled in :meth:`translate`.
    """

    # Seconds to wait for the HTTP round trip; without a timeout a stalled
    # connection would block the whole batch forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, appKey, appSecret):
        """Store the credentials and the default translation settings.

        Args:
            appKey: Application id issued by Youdao.
            appSecret: Application secret issued by Youdao.
        """
        self.YOUDAO_URL = 'https://openapi.youdao.com/api/'
        self.APP_KEY = appKey  # application id
        self.APP_SECRET = appSecret  # application secret
        self.langFrom = 'auto'  # source language; 'auto' lets the service detect it
        self.langTo = 'zh-CHS'  # target language
        self.vocabId = "您的用户词表ID"  # optional user-glossary id (placeholder value)

    def encrypt(self, signStr):
        """Return the hexadecimal SHA-256 digest of ``signStr`` (UTF-8 encoded)."""
        return hashlib.sha256(signStr.encode('utf-8')).hexdigest()

    def truncate(self, q):
        """Shorten ``q`` for signing.

        Texts of at most 20 characters are returned unchanged; longer texts
        become ``first 10 chars + str(len(q)) + last 10 chars``.  ``None`` is
        passed through.
        """
        if q is None:
            return None
        size = len(q)
        if size <= 20:
            return q
        return q[:10] + str(size) + q[-10:]

    def do_request(self, data):
        """POST the form-encoded ``data`` to the API and return the response."""
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        return requests.post(
            self.YOUDAO_URL,
            data=data,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

    @staticmethod
    def _parse_translation(body):
        """Extract the first translation from the JSON response text ``body``.

        Raises:
            RuntimeError: If the payload carries no ``translation`` entry.
                The ``errorCode`` field of the payload, when present, is
                included in the message so the failure can be diagnosed.
        """
        payload = json.loads(body)
        translations = payload.get('translation')
        if not translations:
            raise RuntimeError(
                'Youdao translation failed, errorCode=%s' % payload.get('errorCode')
            )
        return translations[0]

    def translate(self, q):
        """Translate ``q`` and return the translated text.

        The result is also printed, which serves as progress output for the
        batch run.

        Raises:
            ValueError: If ``q`` is empty or ``None`` (nothing to translate).
            RuntimeError: If the service answers without a translation.
        """
        if not q:
            raise ValueError('nothing to translate: empty text')

        curtime = str(int(time.time()))
        salt = str(uuid.uuid1())
        signStr = self.APP_KEY + self.truncate(q) + salt + curtime + self.APP_SECRET
        data = {
            'from': self.langFrom,
            'to': self.langTo,
            'signType': 'v3',
            'curtime': curtime,
            'appKey': self.APP_KEY,
            'q': q,
            'salt': salt,
            'sign': self.encrypt(signStr),
            'vocabId': self.vocabId,
        }
        response = self.do_request(data)
        result = self._parse_translation(response.content.decode('utf-8'))
        print(result)
        return result
# File extensions (without the dot) that the pipeline handles.
file_type_list = ['pdf']
# Keywords used to sort papers into sub-folders of dst_folder.
keyword_list = ['method', 'CFD', '装置', '波浪能', '方法', 'WEC']
# Folder scanned for the source PDFs (no trailing separator).
# os.path.join yields '.\src_dir' on Windows, as before, and a valid path elsewhere.
src_folder = os.path.join('.', 'src_dir')
# Folder that receives the renamed copies (no trailing separator).
dst_folder = os.path.join('.', 'dst_dir')
def get_file_list(folder, file_types=None):
    """Return the full paths of all files under ``folder`` with a wanted extension.

    The folder is walked recursively.  Extensions are compared
    case-insensitively, and a file only matches when it really has an
    extension (a file literally named ``pdf`` is not a PDF).

    Args:
        folder: Root folder to scan.
        file_types: Iterable of extensions without the dot.  Defaults to the
            module-level ``file_type_list``.

    Returns:
        List of matching paths (directory path joined with file name), in
        ``os.walk`` order.
    """
    if file_types is None:
        file_types = file_type_list
    wanted = {str(ext).lower().lstrip('.') for ext in file_types}

    matches = []
    for dirpath, _dirnames, filenames in os.walk(folder):
        for name in filenames:
            extension = os.path.splitext(name)[1]
            if extension and extension[1:].lower() in wanted:
                matches.append(os.path.join(dirpath, name))
    return matches
def RenameCopyTo(fileName, path, pdf_info):
    """Copy ``fileName`` into ``path`` under a name built from its PDF metadata.

    The new name is ``<author>_<title>_<creation date>`` with characters that
    are unsafe in file names replaced by ``_``, cut to 250 characters, and
    followed by the original extension.  An existing file with the same name
    is overwritten.

    Args:
        fileName: Path of the file to copy.
        path: Destination folder, with or without a trailing separator; it
            is created when missing.
        pdf_info: Mapping with optional ``/Author``, ``/Title`` and
            ``/CreationDate`` entries.  ``None``, missing keys and empty
            values fall back to ``noAuthor`` / ``noTitle`` / ``noDate``.
    """
    # Create the destination itself; taking dirname() first only worked when
    # the caller remembered the trailing separator.
    os.makedirs(path, exist_ok=True)

    info = pdf_info if pdf_info else {}

    def _meta(key, default):
        # Index access (not .get) is used so mapping types that customise
        # item lookup behave as they did in the original code.
        if key in info:
            value = info[key]
            if value:
                return str(value)
        return default

    new_file_name = '_'.join((
        _meta('/Author', 'noAuthor'),
        _meta('/Title', 'noTitle'),
        _meta('/CreationDate', 'noDate'),
    ))

    # Replace characters that are illegal or awkward in file names,
    # including line breaks that can appear inside metadata strings.
    unsafe = str.maketrans({char: '_' for char in '? *<>|/\\:\'+,"\n\r\t'})
    new_file_name = new_file_name.translate(unsafe)

    extension = os.path.splitext(fileName)[1]
    shutil.copy(fileName, os.path.join(path, new_file_name[:250] + extension))
def RenameCopyByKeywords():
    """Sort every PDF found under ``src_folder`` into sub-folders of ``dst_folder``.

    Each file is copied (and renamed by ``RenameCopyTo``) into:

    * ``<keyword>/`` for every entry of ``keyword_list`` contained in the
      PDF's ``/Keywords`` metadata (case-sensitive substring test),
    * ``no_keyword/`` when none is contained,
    * ``<k1>_<k2>.../`` additionally, when the ``,``/``;``-separated keyword
      entries match more than one listed keyword (case-insensitive),
    * ``no_title/`` when the title is empty,
    * ``meta_data_error/`` when ``/Title`` or ``/Keywords`` is missing,
    * ``file_error/`` when reading the PDF or copying it raised.
    """
    wanted_keys = ('/Author', '/CreationDate', '/Title', '/Keywords')

    for file in get_file_list(src_folder):
        # Reset per file so the error handler never sees metadata left over
        # from the previous file (or an unbound name on the first one).
        pdf_info = {}
        try:
            with open(file, 'rb') as stream:
                pdf_reader = PdfFileReader(stream)
                if pdf_reader.isEncrypted:
                    pdf_reader.decrypt('')
                raw_info = pdf_reader.getDocumentInfo() or {}
                # Snapshot the needed fields while the stream is still open;
                # presumably values may be resolved lazily from the file.
                pdf_info = {key: raw_info[key] for key in wanted_keys if key in raw_info}

            if '/Title' not in pdf_info or '/Keywords' not in pdf_info:
                RenameCopyTo(file, dst_folder + '/meta_data_error/', pdf_info)
                continue

            paper_title = pdf_info['/Title']
            paper_keywords = pdf_info['/Keywords']
            if not paper_title:
                RenameCopyTo(file, dst_folder + '/no_title/', pdf_info)
                continue

            # One copy per listed keyword that occurs in the keyword string.
            matched_any = False
            for keyword in keyword_list:
                if keyword in paper_keywords:
                    RenameCopyTo(file, dst_folder + "/" + keyword + "/", pdf_info)
                    matched_any = True
            if not matched_any:
                RenameCopyTo(file, dst_folder + '/no_keyword/', pdf_info)

            # Combined folder: only attempted when the keyword string is
            # actually delimited by ',' or ';'.
            entries = []
            if ',' in paper_keywords or ';' in paper_keywords:
                entries = [part.strip() for part in re.split('[,;]', paper_keywords)]
            # Each listed keyword is counted once, however many entries it
            # matches, so folders like 'method_method' are not produced.
            combined = [
                k for k in keyword_list
                if k and any(k.lower() in entry.lower() for entry in entries if entry)
            ]
            if len(combined) > 1:
                RenameCopyTo(file, dst_folder + '/' + '_'.join(combined) + '/', pdf_info)
        except Exception as e:
            # Best effort: keep going with the next file, but say why this
            # one ended up in file_error.
            print('PDF处理失败:' + str(e) + ',路径为:' + file)
            RenameCopyTo(file, dst_folder + '/file_error/', pdf_info)
def generate_author(author):
    """Turn a raw author string into citation form ``Surname I J``.

    A leading "by " and trailing footnote marks (whitespace, digits and any
    of ``* ∗ / @ † ( & )``) are removed.  The last remaining word is used as
    the surname, followed by the first character of each preceding word.

    Args:
        author: Raw author text, e.g. ``'by John A. Smith*1'``.

    Returns:
        The formatted name, e.g. ``'Smith J A'``; an empty string when
        nothing is left after cleaning.
    """
    # Only strip "by" as a leading word; an unanchored pattern would also
    # eat the "by " inside names such as "Abby Smith".
    cleaned = re.sub(r'^\s*by\s+', '', author, flags=re.IGNORECASE)
    cleaned = re.sub(r'[\s\d*∗/@†(&)]+$', '', cleaned)

    # str.split() never yields empty items, so leading or repeated
    # whitespace cannot cause an IndexError below.
    words = cleaned.split()
    if not words:
        return ''
    *given_names, surname = words
    return ' '.join([surname] + [name[0] for name in given_names])
# Patterns used by pdfParse, compiled once instead of on every text box.

# First-block texts that are page furniture (journal banner, URL, volume
# line ...) rather than the paper title.  Applied to lower-cased text.  The
# dots in "www." and "vol." are matched literally so that titles containing
# e.g. "evolution" or "finite volume" are no longer mistaken for a header.
_HEADER_NOISE_RE = re.compile(
    r'(^[0-9])|(^(research )?article)|(unclassified)|(www\.)'
    r'|(accepted (from|manuscript))|(proceedings of)|(\bvol[.\s])'
    r'|(volume \d)|(https?://)|(^ieee)|(sciencedirect)|(\d{4}\)$)'
    r'|(\d{1,4} – \d{1,4}$)|(cid:)'
)
# Start of a trailing "Keywords" section inside an abstract block.
_KEYWORDS_RE = re.compile(r'[Kk](?:eyword|EYWORD)[sS]?')
# Leading "Abstract" label plus one optional separator character.
_ABSTRACT_LABEL_RE = re.compile(r'^\s*[aA](?:bstract|BSTRACT)[- —.]?')
# Heading of the conclusion section; applied to lower-cased, space-free text.
_CONCLUSION_HEADING_RE = re.compile(r'(^(i|v|x|\d)*\.?conclusions?)|(conclusions?$)')


def pdfParse(DataIO, save_path, appKey, appSecret):
    """Extract title, translated abstract and translated conclusion of a PDF.

    The horizontal text boxes of every page are visited in layout order and
    the following is appended to ``save_path`` (UTF-8):

    * the first text box that is not recognised as header noise (the title),
    * ``<first author>等人提出:<translated abstract>``,
    * the translated text box(es) following a "conclusion" heading,
    * a final newline.

    Translation errors and per-box/per-page errors are printed and skipped
    so that one bad block does not abort the document.

    Args:
        DataIO: Binary file object of the PDF.
        save_path: Path of the text file to append to.
        appKey: Youdao application id.
        appSecret: Youdao application secret.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    parser = PDFParser(DataIO)
    doc = PDFDocument(parser)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmagr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmagr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmagr, device)
    fanyi = YouDaoFanyi(appKey, appSecret)

    last_para = ''  # previous text block, used to spot section headings
    count = 0       # index of the current text block: 0 = title, 1 = authors
    author = ''     # first author in citation form
    ab_count = 0    # abstracts written so far; keeps in-text "Abstract" out

    # The output file is opened once for the whole document rather than
    # once per text box.
    with open(save_path, 'a', encoding='utf-8') as f:
        try:
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                try:
                    for x in layout:
                        if not isinstance(x, LTTextBoxHorizontal):
                            continue
                        try:
                            result = x.get_text()

                            if count == 0:
                                # ResearchGate cover page: skip the rest of it.
                                if result.lower().startswith('see discussions'):
                                    break
                                if _HEADER_NOISE_RE.search(result.lower().rstrip()):
                                    # Not the title yet; undo the increment below.
                                    count -= 1
                                else:
                                    f.write('\n' + result.replace('\n', '') + '\n')
                            elif count == 1:
                                # Keep the first author only.
                                author = result.split('\n')[0].split(',')[0].split(' and ')[0]
                                author = generate_author(author)
                                print('author ' + author)

                            # Drop the line breaks introduced by the PDF layout.
                            result = result.replace('\n', '')
                            try:
                                last_para = last_para.lower().replace(' ', '')
                                compact = result.lower().replace(' ', '')

                                if last_para.endswith('abstract'):
                                    # "Abstract" heading was its own block;
                                    # this block is the abstract body.
                                    body = _KEYWORDS_RE.split(result, maxsplit=1)[0]
                                    trans_result = fanyi.translate(body).replace('我们', '他们')
                                    ab_count += 1
                                    f.write(author + '等人提出:' + trans_result + '\n')
                                elif compact.startswith('abstract') and not compact.endswith('abstract'):
                                    # Label and abstract body share one block.
                                    if ab_count == 0:
                                        # Strip only the leading label, not
                                        # the word "abstract" inside the text.
                                        body = _ABSTRACT_LABEL_RE.sub('', result)
                                        body = _KEYWORDS_RE.split(body, maxsplit=1)[0]
                                        trans_result = fanyi.translate(body).replace('我们', '他们')
                                        ab_count += 1
                                        f.write(author + '等人提出:' + trans_result + '\n')
                                elif _CONCLUSION_HEADING_RE.search(last_para):
                                    # A figure caption right below the heading
                                    # is skipped; last_para is deliberately left
                                    # untouched so the next block is tried.
                                    if result.lower().startswith('fig'):
                                        continue
                                    trans_result = fanyi.translate(result)
                                    f.write(trans_result.replace('我们', '他们') + '\n')
                            except Exception as e:
                                print(e)

                            last_para = result
                            count += 1
                        except Exception as e:
                            print('out' + str(e))
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
        f.write('\n')
def TranslateFile(folder=None, appKey=None, appSecret=None):
    """Run ``pdfParse`` on every PDF below ``folder`` and report the outcome.

    For each ``<name>.pdf`` a ``<name>.txt`` is written next to it.  Files
    that fail are listed in ``./translate_error_files.txt``.

    Args:
        folder: Root folder to scan recursively.  Defaults to the
            module-level ``dst_folder``.
        appKey: Youdao application id.  Defaults to the environment variable
            ``YOUDAO_APP_KEY``, then to the value embedded below.
        appSecret: Youdao application secret.  Defaults to the environment
            variable ``YOUDAO_APP_SECRET``, then to the value embedded below.

    Returns:
        Tuple ``(success_count, fail_count)``.
    """
    # SECURITY NOTE(review): credentials are hard-coded in the source.  They
    # are kept as a fallback for backward compatibility, but should be
    # supplied via arguments or the environment and removed from the file.
    if appKey is None:
        appKey = os.environ.get('YOUDAO_APP_KEY', '066d0d482744283c')
    if appSecret is None:
        appSecret = os.environ.get('YOUDAO_APP_SECRET', 'ay0LLLVb3eKsCasiLxqze4Zh9iBqBUQU')
    if folder is None:
        folder = dst_folder

    success_count = 0  # documents processed without an exception
    fail_count = 0     # documents that raised

    # os.walk() on a missing folder yields nothing, which made the function
    # return silently; say so explicitly instead.
    if not os.path.isdir(folder):
        print('目录不存在,没有可翻译的文件:' + os.path.abspath(folder))
        return success_count, fail_count

    # A single recursive walk visits every file exactly once, including
    # files that sit directly in ``folder``.
    for root, _dirs, files in os.walk(folder):
        for name in files:
            stem, extension = os.path.splitext(name)
            if extension[1:].lower() not in [t.lower() for t in file_type_list]:
                continue
            pdf_path = os.path.join(root, name)
            # splitext keeps dots inside the name ("J._Smith_x.pdf" ->
            # "J._Smith_x.txt" rather than "J.txt").
            txt_path = os.path.join(root, stem + '.txt')
            try:
                print(name)
                with open(pdf_path, 'rb') as pdf_html:
                    pdfParse(pdf_html, txt_path, appKey, appSecret)
                success_count += 1
            except Exception as e:
                # Record the failure and continue with the next document.
                print('文档读取失败:' + str(e) + ',路径为:' + pdf_path)
                with open('./translate_error_files.txt', 'a', encoding='utf-8') as ffff:
                    ffff.write('\n' + pdf_path + '\n')
                fail_count += 1

    print('处理完成:成功 %d 个,失败 %d 个' % (success_count, fail_count))
    return success_count, fail_count
# Script entry point.
if __name__ == "__main__":
    # Step 1 (currently disabled): sort the PDFs from src_folder into
    # dst_folder.  TranslateFile() only walks dst_folder, so it has nothing
    # to do unless this step has been run at least once.
    # RenameCopyByKeywords()
    TranslateFile()
###### 问题似乎与我设置的路径有误有关