问题遇到的现象和发生背景
使用BeatyfulSoup爬取网页文本后,将获得的对象文本通过DOCUMENT.add_paragraph添加到word保存报错ValueError,将对象编码utf-8又报错Typeerror
问题相关代码,请勿粘贴截图
import requests
from bs4 import BeautifulSoup
from docx import Document
def find_txt(html):
page = BeautifulSoup(html, "html.parser")
content = page.find("div", attrs={"style":"border:1px solid #C8DBD3;padding:20px;line-height:24px;"})
print(content.text)
return content.text
def save_file(lst):
document = Document()
document.add_paragraph(lst.encode('utf-8'))
document.save('html.docx')
def main(url, headers):
response = requests.get(url, headers=headers)
html = (response.text.replace("
", "")).replace("
", "")
final_result = find_txt(html)
save_file(final_result)
if name == "main":
url = "https://wenku.baidu.com/view/1617b18dae1ffc4ffe4733687e21af45b207fe47.html%22
headers ={'User-agent':'Googlebot'}
main(url, headers)
运行结果及报错内容
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
TypeError: 'in ' requires string as left operand, not int