Can you help me improve this code? I'd like to add page-by-page crawling and to fetch the pages after page 11, which the site only shows after you log in.
I want to crawl 1000 pages of data.
# -*- coding: utf-8 -*-
from urllib import request, error
from lxml import etree
import csv
def initUrls():
    urls = [
        'https://www.jjwxc.net/bookbase.php?fw0=0&fbsj0=0&xx0=0&mainview0=0&sd0=0&lx0=0&fg0=0&bq=-1&sortType=4&isfinish=0&collectiontypes=ors&searchkeywords=&page=12']
    return urls
def get(urls):
    for url in urls:
        try:
            data = request.urlopen(url).read().decode('gb18030')
            pData = parse(data)
            out(pData)
        except error.HTTPError as e:
            print(f"HTTP error: {e}")
        except error.URLError as e:
            print(f"URL error: {e}")
        except etree.XPathSyntaxError as e:
            print(f"XPath syntax error: {e}")
        except Exception as e:
            print(f"An error occurred: {e}")
def parse(data):
    pData = etree.HTML(data)
    items = pData.xpath('//tr')[1:]
    itemDatas = []
    for item in items:
        itemData = {}
        tds = item.xpath('./td')
        texts = tds[0].xpath('./a//text()')
        itemData['作者'] = '' if (len(texts) == 0) else texts[0]
        texts = tds[1].xpath('./a//text()')
        itemData['书名'] = '' if (len(texts) == 0) else texts[0]
        texts = tds[2].xpath('.//text()')
        itemData['类型'] = '' if len(texts) == 0 else texts[0].strip().replace('\n', '')
        # add .replace('-', '') to also strip the hyphens in e.g. 原创-纯爱-近代现代-剧情
        texts = tds[3].xpath('./font//text()')
        itemData['进度'] = '' if (len(texts) == 0) else texts[0]
        texts = tds[4].xpath('.//text()')
        itemData['字数'] = '' if (len(texts) == 0) else texts[0]
        texts = tds[5].xpath('.//text()')
        itemData['积分'] = '' if (len(texts) == 0) else texts[0]
        itemDatas.append(itemData)
    return itemDatas
# Print the results and save them to a CSV file
def out(data):
    print(data)
    # Note: 'w' mode overwrites the file on every call; when crawling
    # multiple pages, append instead or write all rows in a single pass.
    with open('xiaoshuo3.csv', 'w', encoding='utf-8', newline='') as f:
        csvf = csv.DictWriter(f, fieldnames=['作者', '书名', '类型', '进度', '字数', '积分'])
        csvf.writeheader()
        csvf.writerows(data)
get(initUrls())
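
Here is a minimal sketch of one way to do the paging and login part, reusing the parse() function above. It assumes you can copy the Cookie header from a browser session where you are already logged in; the cookie string, the User-Agent, the page count, and the 1-second delay are placeholders, not values confirmed for this site. Writing the CSV once, outside the page loop, also avoids the overwrite problem in out().

# Sketch: crawl pages 1..N by varying the "page" query parameter and
# sending a login cookie copied from the browser (placeholder values).
import csv
import time
from urllib import request, error
from lxml import etree

BASE_URL = ('https://www.jjwxc.net/bookbase.php?fw0=0&fbsj0=0&xx0=0&mainview0=0'
            '&sd0=0&lx0=0&fg0=0&bq=-1&sortType=4&isfinish=0&collectiontypes=ors'
            '&searchkeywords=&page={page}')

HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    # Placeholder: paste the Cookie header from a logged-in browser session here.
    'Cookie': 'PASTE_YOUR_LOGIN_COOKIE_HERE',
}

def fetch(page):
    # Build a request that carries the login cookie, then decode like the original code.
    req = request.Request(BASE_URL.format(page=page), headers=HEADERS)
    return request.urlopen(req).read().decode('gb18030')

def crawl(pages, outfile='xiaoshuo3.csv'):
    with open(outfile, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['作者', '书名', '类型', '进度', '字数', '积分'])
        writer.writeheader()
        for page in range(1, pages + 1):
            try:
                rows = parse(fetch(page))   # reuse the parse() defined above
                writer.writerows(rows)
                print(f'page {page}: {len(rows)} rows')
            except error.HTTPError as e:
                print(f'page {page} HTTP error: {e}')
            except error.URLError as e:
                print(f'page {page} URL error: {e}')
            time.sleep(1)  # small delay between requests to be polite to the server

# crawl(1000)

If the site rejects the request even with the cookie, check in the browser's developer tools which headers the logged-in request actually sends and copy those as well.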