# -*- coding:UTF-8 -*-
import csv
import sys
import time

import requests
from bs4 import BeautifulSoup
server = 'http://www.biqukan.com/'
target = 'http://www.biqukan.com/1_1094/'
names = [] # 存放章节名
urls = [] # 存放章节链接
def get_download_urls():
req = requests.get(url=target)
html = req.text
bf = BeautifulSoup(html, 'html.parser')
div = bf.find('div', class_='listmain')
dl = div.find('dl')
dd = dl.find_all('dd')
for each in dd[15:]:
names.append(each.string)
urls.append(server + each.find('a').get('href'))
def get_contents(u):
req = requests.get(url=u)
html = req.text
bf = BeautifulSoup(html, 'html.parser')
texts = bf.find_all('div',{'id': 'content'},class_ = 'showtxt')
if len(texts)>0:
final = texts[0].text.replace('\xa0' * 8, '\n\n')
else:
final=''
return final
def writer( name, path,text):
write_flag = True
with open(path, 'a', encoding='utf-8') as f:
fieldnames = ['title']
writefile = csv.DictWriter(f, fieldnames=fieldnames)
writefile.writerow({'title': name+'\n'+text})
if __name__ == "__main__":
url=get_download_urls()
for i in range(len(names)):
writer(names[i], '一念永恒.txt', get_contents(urls[i]))
# 就是 get_contents(u)
# 函数里为啥有的texts长度会等于0呢,单独爬这一个页面的时候texts是有内容的呀
# (Translation: "In get_contents(u), why is texts sometimes length 0, when
#  crawling that single page by itself texts does have content?"
#  NOTE(review): most likely the site intermittently returns a throttled /
#  anti-crawl page without the content div — retrying the request after a
#  short pause usually succeeds; verify by logging req.text when empty.)