Python爬取小说 有些章节爬得到有些爬不到 分别爬取都是可以的
# -*- coding:UTF-8 -*-
import csv
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

server = 'http://www.biqukan.com/'
target = 'http://www.biqukan.com/1_1094/'
names = []  # 存放章节名
urls = []  # 存放章节链接

def get_download_urls():
    req = requests.get(url=target)
    html = req.text
    bf = BeautifulSoup(html, 'html.parser')
    div = bf.find('div', class_='listmain')
    dl = div.find('dl')
    dd = dl.find_all('dd')

    for each in dd[15:]:
        names.append(each.string)
        urls.append(server + each.find('a').get('href'))

def get_contents(u):
    req = requests.get(url=u)
    html = req.text
    bf = BeautifulSoup(html, 'html.parser')
    texts = bf.find_all('div',{'id': 'content'},class_ = 'showtxt')

    if len(texts)>0:
      final = texts[0].text.replace('\xa0' * 8, '\n\n')
    else:
      final=''
    return final

def writer( name, path,text):
    write_flag = True
    with open(path, 'a', encoding='utf-8') as f:
        fieldnames = ['title']
        writefile = csv.DictWriter(f, fieldnames=fieldnames)
        writefile.writerow({'title': name+'\n'+text})

if __name__ == "__main__":
  url=get_download_urls()
  for i in range(len(names)):
        writer(names[i], '一念永恒.txt',  get_contents(urls[i]))

就是 get_contents(u)

函数里为啥有的texts长度会等于0呢,单独爬这一个页面的时候texts是有内容的呀

2个回答

←如果以下回答对你有帮助,请点击右边的向上箭头及采纳下答案

修改下get_contents函数,打印下print(u,req,html),检测是哪里错了,一般会因为有反爬,建议可以加下请求头和延时

def get_contents(u):
    req = requests.get(url=u)
    html = req.text
    bf = BeautifulSoup(html, 'html.parser')
    texts = bf.find_all('div',{'id': 'content'},class_ = 'showtxt')

    if len(texts)>0:
      final = texts[0].text.replace('\xa0' * 8, '\n\n')
    else:
      print(u,req,html)
      final=''
    return final

texts长度会等于0是因为

        urls.append(server + each.find('a').get('href'))

这里是 网站+网站：有些章节的 href 已经是完整网址，再拼接 server 就得到错误的 URL。但并非所有 href 都是绝对链接，直接去掉 server 也可能出错；更稳妥的做法是用 urllib.parse.urljoin(server, href)，相对链接和绝对链接都能正确处理。

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!