import re
import requests
import time
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
info_lists=[]
f = open('C:/Users/xyh/Desktop/baike.txt', 'a+')#创文件
def judgment_sex(class_name):#判别性别
if class_name =='womenIcon':
return '女'
else:
return '男'
def get_info(url):#详细信息
res=requests.get(url)
ids=re.findall('<h2>(.*?)<h2>',res.text,re.S)
levels=re.findall('<div class="articleGender \D+Icon">(.*?)</div>',res.text,re.S)
sexs=re.findall('<div class="articleGender (.*?)">',res.text,re.S)
contents=re.findall('<div classs="content">.*?<span>(.*?)</span>',res.text,re.S)
laughs=re.findall('<span class="stats-vote"><i class=""number>(\d+)</i>',res.text,re.S)
comments=re.findall('<span class="stats-comments"><i class=""number>(\d+)</i>',res.text,re.S)
for id,level,sex,content,laugh,comment in zip(ids,levels,sexs,contents,laughs,comments,):
info={
'id':id,
'level':level,
'sex':judgment_sex(sex),
'content':content,
'laugh':laugh,
'comment':comment
}
info_lists.append(info)
if __name__ == '__main__':
urls=['https://www.qiushibaike.com/text/page/{}/'.format(number) for
number in range(1,11)]
for url in urls:
get_info(url)
time.sleep(1)
for info_list in info_lists:#写入到文件
try:
f.write(info_list['id']+'\n')
f.write(info_list['level']+'\n')
f.write(info_list['sex']+'\n')
f.write(info_list['content']+'\n')
f.write(info_list['laugh']+'\n')
f.write(info_list['comment']+'\n')
f.close()
except UnicodeError:
pass爬取糗事百科段子文字,爬取不到信息,求各位老板指点,谢谢
- 写回答
- 好问题 0 提建议
- 关注问题
- 邀请回答
-
2条回答 默认 最新
CSDN专家-江小黑 2021-04-12 11:48关注ids=re.findall('<h2>(.*?)<h2>',res.text,re.S) levels=re.findall('<div class="articleGender \D+Icon">(.*?)</div>',res.text,re.S) sexs=re.findall('<div class="articleGender (.*?)">',res.text,re.S) contents=re.findall('<div classs="content">.*?<span>(.*?)</span>',res.text,re.S) laughs=re.findall('<span class="stats-vote"><i class=""number>(\d+)</i>',res.text,re.S) comments=re.findall('<span class="stats-comments"><i class=""number>(\d+)</i>',res.text,re.S)打印了下这几个变量都有内容了,你是后面的处理逻辑有问题。
本回答被题主选为最佳回答 , 对您是否有帮助呢?评论 打赏 举报解决 1无用