很简单的爬取一个小说,但是编码遇到报错,gbk,utf-8都不行。
# -*- coding: utf-8 -*-
import urllib.request
import re
import sys
import os
import urllib
from bs4 import BeautifulSoup
from urllib import request
#根据给定的网址来获取网页详细信息,得到的html就是网页的源代码
def getHtml(weburl, encoding='gbk'):
    """Download a web page and return its source as text.

    Args:
        weburl: URL of the page to fetch.
        encoding: Codec used to decode the response body. Defaults to
            'gbk', the value this function previously hard-coded.

    Returns:
        The decoded page source as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    # Browser-like headers sent with every request.
    webheaders = {
        'Referer': 'http://www.biqukan.cc/book/20461/12592815.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    }
    req = urllib.request.Request(url=weburl, headers=webheaders)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req) as page:
        html = page.read()
    return html.decode(encoding)
def gettext(html, path='D:\\test.txt'):
    """Extract the chapter text from a page and append it to a file.

    Args:
        html: Page source as a ``str``.
        path: File to append the extracted text to. Defaults to the
            location this function previously hard-coded.

    Raises:
        AttributeError: If the page has no element with class
            'panel-body' and id 'htmlContent' (``find`` returns None).
    """
    soup = BeautifulSoup(html, "lxml")
    content = soup.find(class_='panel-body', id='htmlContent')
    txt = content.get_text()
    # Write as UTF-8 explicitly. Without `encoding`, open() uses the locale
    # codec (GBK on Chinese Windows), which cannot represent characters such
    # as the non-breaking space '\xa0' and raises UnicodeEncodeError.
    # NOTE: text already in the file from an earlier run with a different
    # encoding is left as-is; delete the old file to avoid mixed encodings.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(txt)
# Chapter page to download.
weburl = "http://www.biqukan.cc/book/20461/12592815.html"

# Fetch the page source for that URL, then extract the text and save it.
html = getHtml(weburl)
gettext(html)
错误信息:
UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 75: illegal multibyte sequence
还有:UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 116: invalid start byte