刚刚学完bs4想试一下爬取网络小说,但试了好久都是那两个错误,实在找不到什么原因
一个就是输出标签的时候中文乱码
还有一个:
File "", line 27, in <module>
content = div_tag.text
AttributeError: 'NoneType' object has no attribute 'text'
import requests
from bs4 import BeautifulSoup
import lxml
#爬取所有的章节标题和章节内容https://www.xbiquge.la/13/13959/
if __name__ == '__main__':
url = 'https://www.xbiquge.la/13/13959/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
page_text = requests.get(url=url,headers=headers).text
#在首页中解析章节标题和详情页的url
#实例化BeautifulSoup对象
soup = BeautifulSoup(page_text,'lxml')
#print(soup)
#解析章节标题
dd_list = soup.select('dd')
print(dd_list)
fp = open('shengxu.txt','w',encoding='utf-8')
for dd in dd_list:
title = dd.a.string
detail_url = 'https://www.xbiquge.la/' + dd.a['href']
#对详情页发起请求,解析章节内容
detail_data_text = requests.get(url=detail_url,headers=headers).text
#解析出详情页中的内容
detail_soup = BeautifulSoup(detail_data_text,'lxml')
div_tag = detail_soup.find('dic',id='content')
content = div_tag.text
fp.write(title+':'+content+'\n')
print(title,'!!!')