http://www.moe.gov.cn/jyb_xxgk/moe_1777/moe_1778/202003/t20200326_435127.html
我已经能获取到所有 b 标签里面的内容,但是不知道怎么把它添加到程序里面。现在就差获取标题里面的二级标题了,主要就是 //p 标签下面 //b 标签的内容;不知如何把它添加到程序里,使输出的内容和原文顺序一致、不错位。
from urllib.parse import urljoin

import requests
from lxml import etree
# Listing page whose notice links are scraped.
url = 'http://www.moe.gov.cn/jyb_xxgk/moe_1777/moe_1778/'

# HTTP headers sent with every request.
# The User-Agent is assembled from adjacent string literals rather than
# backslash continuations inside one literal: with continuations, whatever
# whitespace (or lack of it) starts the next source line becomes part of the
# header value, which previously produced "WOW64)AppleWebKit" with no space.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/52.0.2743.116 Safari/537.36'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
# Fetch the listing page and collect the href of every notice linked from
# the <ul id="list"> element.
response = requests.get(url, headers=headers, timeout=30).text
html = etree.HTML(response)
result1 = html.xpath('//ul[@id="list"]//li//a/@href')

# Every notice is appended to this single file.
# NOTE(review): the bytes written below are plain UTF-8 text with CRLF line
# endings; the ".docx" suffix does not make the file a Word document --
# confirm whether ".txt" (or a real docx writer) was intended.
fname = r"C:\Users\Administrator\Desktop\1234.docx"

for site in result1:
    # urljoin resolves "./x", "../x" and absolute hrefs against the listing
    # URL; plain string concatenation only works for the first form.
    xurl = urljoin(url, site)
    req = requests.get(xurl, headers=headers, timeout=30)
    # Parse from raw bytes so lxml detects the page encoding itself.
    html2 = etree.HTML(req.content)

    # Notice title.
    result3 = html2.xpath('//h1/text() ')

    # One entry per <p>, in document order. string(.) concatenates all
    # descendant text, so sub-headings wrapped in <b> inside a paragraph stay
    # in place; '//p/text()' returned only the direct text nodes and dropped
    # them. Paragraphs without any text are skipped.
    result2 = []
    for paragraph in html2.xpath('//p'):
        text = paragraph.xpath('string(.)')
        if text:
            result2.append(text)

    s = result3 + result2
    with open(fname, 'ab') as fp:
        fp.writelines(i.encode('utf-8') + b'\r\n' for i in s)