需求是:把页面上的内容保存到 Word 文档中,页面上既有文字也有图片。目前只实现了文字和图片分开保存(文字在前,图片在后),希望能按照页面上的原有顺序,将文字和图片穿插保存。
我想在读取到内容后先判断它是文字还是图片,再分别执行不同的代码进行保存,但尝试了很久也没有找到方法。希望最终能按照页面上的顺序,把内容保存到本地的 Word 文档中。
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment, NavigableString
from docx import Document
from docx.shared import Inches,Pt,RGBColor
from docx.oxml.ns import qn
# HTTP headers passed to the requests calls in this script (browser-like User-Agent).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
}

# Output document; its 'Normal' style is set to black 12 pt 宋体 (SimSun).
file = Document()
normal_style = file.styles['Normal']
normal_font = normal_style.font
normal_font.name = u'宋体'
# The East Asian typeface is written directly into the style XML
# (w:eastAsia attribute of rFonts); this runs after font.name is assigned,
# same order as before.
normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
normal_font.size = Pt(12)
normal_font.color.rgb = RGBColor(0, 0, 0)
# Page to archive, and the title used for the heading line and the output file name.
url = 'https://bbs.tiexue.net/post_7023745_1.html'
title = '标题'

# Download the page; the timeout tuple is (connect, read) in seconds.
strhtml = requests.get(url, headers=headers, timeout=(4, 3))
# Fail loudly on an HTTP error status instead of parsing an error page.
strhtml.raise_for_status()
# When the server declares no charset, requests decodes text/* responses as
# ISO-8859-1, which garbles Chinese text; fall back to the encoding detected
# from the response body in that case only.
if 'charset' not in strhtml.headers.get('Content-Type', '').lower():
    strhtml.encoding = strhtml.apparent_encoding
soup = BeautifulSoup(strhtml.text, 'html.parser')

# Post paragraphs (<p class="bbsp"> directly under #postContent) and the
# linked images (<a><img>) they contain, both in document order.
data = soup.select('#postContent > p[class="bbsp"]')
pic = soup.select('#postContent > p[class="bbsp"] > a > img')

# The document starts with the source URL followed by the title.
file.add_paragraph(url)
file.add_paragraph(title)
# Write the post content in page order, interleaving text and pictures.
#
# Each selected <p> is walked node by node in document order. Text is
# buffered; when a linked image (<p> > <a> > <img>, the same images the
# page-wide selector matches) is reached, the buffered text is written as a
# paragraph first and the picture is appended right after it.
for item1 in data:
    text_parts = []      # text seen since the last picture inside this <p>
    picture_count = 0    # pictures written for this <p>

    for node in item1.descendants:
        if isinstance(node, NavigableString):
            # Comments are markup, not visible page text.
            if not isinstance(node, Comment):
                text_parts.append(node)
            continue

        # Only <img> directly inside an <a> that is a direct child of this <p>.
        if node.name != 'img':
            continue
        if node.parent.name != 'a' or node.parent.parent is not item1:
            continue

        src = node.get('src')
        if not src:
            # No usable address on this tag; nothing to download.
            continue

        # Flush the text that precedes the picture so the order is kept.
        text = ''.join(text_parts)
        if text.strip():
            file.add_paragraph(text)
        text_parts = []

        # Resolve relative / protocol-relative addresses against the page URL.
        response = requests.get(urljoin(url, src), headers=headers, timeout=(4, 3))
        response.raise_for_status()
        # add_picture accepts a file-like object, so no temporary file is needed.
        file.add_picture(BytesIO(response.content), width=Inches(6))
        picture_count += 1

    # Remaining text after the last picture. A <p> without any picture is
    # always written (even when empty), one paragraph per <p>.
    text = ''.join(text_parts)
    if text.strip() or not picture_count:
        file.add_paragraph(text)
# Save the document in the working directory, named after the title.
docxurl = '{}.docx'.format(title)
file.save(docxurl)