import requests
from bs4 import BeautifulSoup
# Scrape chapters of "Romance of the Three Kingdoms" from shicimingju.com and
# save "<title>:<text>" for each fetched chapter into ./sanguo.txt.

# Table-of-contents page that links to every chapter.
url = "https://www.shicimingju.com/book/sanguoyanyi.html"
# Browser-like User-Agent sent with every request.
dicc = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}

# Fetch the index page. The timeout keeps the script from hanging forever on
# a stalled connection; raise_for_status() stops us from parsing an error page.
index_response = requests.get(url, headers=dicc, timeout=10)
index_response.raise_for_status()
# Force UTF-8 decoding instead of relying on the encoding requests guesses.
index_response.encoding = 'utf-8'
index_soup = BeautifulSoup(index_response.text, 'lxml')
chapter_links = index_soup.select('.book-mulu > ul >li > a')

# NOTE: Python slices are half-open -- seq[start:stop] includes `start` but
# excludes `stop`. So [0:3] yields indices 0, 1, 2 (three items) and [0:2]
# yields indices 0, 1 (two items). Nothing is "missing"; use [0:4] for four.
print(chapter_links[0:3])

# `with` guarantees the file is closed even if a request or parse fails, and
# an explicit UTF-8 encoding makes writing Chinese text platform-independent.
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for anchor in chapter_links[0:2]:
        # .string is None when the <a> has nested tags; fall back to the
        # concatenated text so the string join below cannot fail.
        title = anchor.string
        if title is None:
            title = anchor.get_text()

        # hrefs on the index page are site-relative, so prefix the host.
        link = "https://www.shicimingju.com" + anchor['href']
        chapter_response = requests.get(link, headers=dicc, timeout=10)
        chapter_response.raise_for_status()
        chapter_response.encoding = 'utf-8'
        chapter_soup = BeautifulSoup(chapter_response.text, 'lxml')

        content_div = chapter_soup.find('div', class_="chapter_content")
        if content_div is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise RuntimeError("chapter_content div not found at " + link)
        txt = content_div.text

        fp.write(title + ":" + txt + "\n")
        print(title, 'done!')
返回:
[第一回·宴桃园豪杰三结义 斩黄巾英雄首立功, 第二回·张翼德怒鞭督邮 何国舅谋诛宦竖, 第三回·议温明董卓叱丁原 馈金珠李肃说吕布]
第一回·宴桃园豪杰三结义 斩黄巾英雄首立功 done!
第二回·张翼德怒鞭督邮 何国舅谋诛宦竖 done!
为什么切片结果总是少一项?c[0:3] 以为会返回四项,却只返回三项;c[0:2] 以为会返回三项,却只返回两项?(注:Python 切片是左闭右开区间,c[0:3] 只包含下标 0、1、2,不包含下标 3。)
当然,其实全爬是没问题的,反正可以for循环,但是为什么少这一个?