PyCharm 报错：An error occurred: 'gb18030' codec can't decode byte 0x8b in position 1: illegal multibyte sequence
# -*- coding: utf-8 -*-
import csv
import gzip
from urllib import request

from lxml import etree
def initUrs():
    """Return the list of page URLs to scrape.

    A new list is built on every call; it currently holds a single
    bookbase.php listing URL with ``page=3``.
    """
    listing_url = (
        'https://www.jjwxc.net/bookbase.php'
        '?fw0=0&fbsj0=0&xx0=0&mainview0=0&sd0=0&lx0=0&fg0=0&bq=-1'
        '&sortType=4&isfinish=0&collectiontypes=ors&searchkeywords=&page=3'
    )
    return [listing_url]
def _fetch_html(url):
    """Download *url* and return its body decoded as GB18030 text.

    The server may answer with a gzip-compressed body. Such a body starts
    with the gzip magic bytes 0x1f 0x8b, and decoding it directly fails with
    "'gb18030' codec can't decode byte 0x8b in position 1". The body is
    therefore decompressed first whenever it carries that signature.
    """
    gzip_magic = b'\x1f\x8b'
    # The context manager closes the connection even if read() raises.
    with request.urlopen(url) as response:
        raw = response.read()
    if raw[:2] == gzip_magic:
        raw = gzip.decompress(raw)
    return raw.decode('gb18030')


def get(urls):
    """Fetch, parse and export every page listed in *urls*.

    Each URL is handled independently: the page is downloaded, handed to
    ``parse`` and the result is passed to ``out``. Any failure for one URL
    is reported on stdout and processing continues with the next URL.

    Args:
        urls: Iterable of URL strings to download.
    """
    for url in urls:
        try:
            data = _fetch_html(url)
            pData = parse(data)
            out(pData)
        # HTTPError is a subclass of URLError, so it must be caught first.
        except request.HTTPError as e:
            print(f"HTTP error: {e}")
        except request.URLError as e:
            print(f"URL error: {e}")
        except etree.XPathSyntaxError as e:
            print(f"XPath syntax error: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")
def _first_text(node, path):
    """Return the first result of evaluating *path* on *node*, or ''."""
    texts = node.xpath(path)
    return texts[0] if texts else ''


def parse(data):
    """Extract one record per table row from the HTML text *data*.

    Every ``<tr>`` in the document except the first one is inspected
    (the first is presumably the table header -- verify against the page).
    Rows with fewer than six ``<td>`` cells are skipped instead of raising
    ``IndexError``, so a single odd row no longer discards the whole page.

    Args:
        data: HTML source of a listing page, already decoded to ``str``.

    Returns:
        A list of dicts with the keys '作者', '书名', '类型', '进度', '字数'
        and '积分'; a field is '' when its cell has no matching text.
    """
    pData = etree.HTML(data)
    # Defensive: lxml may hand back None when the input has no parseable
    # markup; there is nothing to extract in that case.
    if pData is None:
        return []
    items = pData.xpath('//tr')[1:]
    itemDatas = []
    for item in items:
        tds = item.xpath('./td')
        if len(tds) < 6:
            # Not a data row (layout row, header cells, ...): ignore it.
            continue
        itemDatas.append({
            '作者': _first_text(tds[0], './a//text()'),
            '书名': _first_text(tds[1], './a//text()'),
            # Appending .replace('-', '') would also remove the hyphens
            # from values such as 原创-纯爱-近代现代-剧情.
            '类型': _first_text(tds[2], './/text()').strip().replace('\n', ''),
            '进度': _first_text(tds[3], './font//text()'),
            '字数': _first_text(tds[4], './/text()'),
            '积分': _first_text(tds[5], './/text()'),
        })
    return itemDatas
# Print the records and save them as a CSV document
def out(data):
    """Echo *data* to stdout and write it to ``xiaoshuo3.csv``.

    The file is opened in write mode, so any existing content is replaced.
    It is encoded as GB18030 and starts with a header row naming the six
    columns.

    Args:
        data: Iterable of dicts keyed by the six column names.
    """
    columns = ['作者', '书名', '类型', '进度', '字数', '积分']
    print(data)
    with open('xiaoshuo3.csv', mode='w', encoding='gb18030', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
# Script entry point: process every URL returned by initUrs().
get(initUrs())
怎么解决