import requests
import re
import os
html = requests.get('https://www.kanunu8.com/book/5874/').content.decode('GB2312')
def get_toc(html):
toc_url_list = []
toc_block = re.findall('正文(.*?)</tbody>', html, re.S)[0]
toc_url = re.findall('href="(.*?)"', toc_block, re.S)
for url in toc_url:
toc_url_list.append('https://www.kanunu8.com/book/5874/'+url)
return toc_url_list
httls = get_toc(html)
for httl in httls:
httl = requests.get(httl).content.decode('GBK')
def get_article(httl):
chapter_name = re.search('size="4">(.*?)<', httl, re.S).group(1)
text_block = re.search('<p>(.*?)</p>', httl, re.S).group(1)
text_block = text_block.replace('<br />', '')
return chapter_name, text_block
chapter_name = get_article(httl)
text_block = get_article(httl)
def save(chapter_name, text_block):
os.makedirs('风雨燕归来', exist_ok=True)
file_path = os.path.join('风雨燕归来', 'chapter_name.txt')
with open(file_path, 'w', encoding='utf-8') as f:
f.write(text_block)
python爬取小说十章,程序可以运行,但什么都没有跑出来,想请教下代码哪里出错了?(电脑是Mac)
- 写回答
- 好问题 0 提建议
- 关注问题
- 邀请回答
-
2条回答 默认 最新
CSDN专家-showbo 2021-09-08 08:55关注改成下面的就行,有帮助麻烦点个采纳【本回答右上角】,谢谢~~
你的问题主要在于获取内容的代码未放如for循环中,save也未调用,还有一些小问题看注释
import requests import re import os def get_toc(html): toc_url_list = [] toc_block = re.findall('正文(.*?)</tbody>', html, re.S)[0] toc_url = re.findall('href="(.*?)"', toc_block, re.S) for url in toc_url: toc_url_list.append('https://www.kanunu8.com/book/5874/'+url) return toc_url_list def get_article(httl): chapter_name = re.search('size="4">(.*?)<', httl, re.S).group(1) text_block = re.search('<p>(.*?)</p>', httl, re.S).group(1) text_block = text_block.replace('<br />', '') return chapter_name,text_block #下面这2局放到上面的for..in里面,而且for...in调整下位置 #chapter_name = get_article(httl) #text_block = get_article(httl) def save(chapter_name, text_block): os.makedirs('风雨燕归来', exist_ok=True) file_path = os.path.join('风雨燕归来', chapter_name+'.txt')#这里文件名直接写死了,未使用传入的参数 with open(file_path, 'w', encoding='utf-8') as f: f.write(text_block) html = requests.get('https://www.kanunu8.com/book/5874/').content.decode('GB2312') httls = get_toc(html) for httl in httls: httl = requests.get(httl).content.decode('GBK') chapter_name,text_block = get_article(httl)#只需要调用一次分别获取对应的值就行 save(chapter_name,text_block)#==========调用保存本回答被题主选为最佳回答 , 对您是否有帮助呢?评论 打赏 举报解决 1无用