微博签到数据爬取,请问一下各位,为什么爬取微博签到页面返回的数据是重复的,而且有时有数据有时没数据?
import requests
import json
import jsonpath
import pprint
import re
import datetime
import csv
datas=[]
for pagenum in range(2,50):
url='https://m.weibo.cn/api/container/getIndex?containerid=1008087e040aa9cb2ec494b0a4d52c147e682c_-_lbs&lcardid=frompoi&extparam=frompoi&luicode=10000011&lfid=100103type=1&q=广州'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'}
parme={'since_id': pagenum}
response=requests.get(url=url,headers=headers,params=parme).json()
for i in range(16):
id =response['data']['cards'][0]['card_group'][i]['mblog']['user']['id']
b=response['data']['cards'][0]['card_group'][i]['mblog']['created_at']
GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
timeArray = datetime.datetime.strptime(b, GMT_FORMAT)
time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
c=response['data']['cards'][0]['card_group'][i]['mblog']['text']
d =response['data']['cards'][0]['card_group'][i]['scheme']
if '全文' in c:
e = re.findall(r'[^\/][\w]+(?=\?)', d)[0]
url1 = 'https://m.weibo.cn/statuses/extend?id=' + e
text = requests.get(url=url1, headers=headers).json()
content = text['data']['longTextContent']
address1 = re.findall(r'</span><span class="surl-text">(.+?)</span>', content)
else:
content=c
address1=re.findall(r'</span><span class="surl-text">(.+?)</span>', c)
datas.append(['id',id, '时间', time, '文本', content,'地点',address1])
with open("paquweiboqiandao.csv", mode='a',errors='ignore') as f:
csvwriter = csv.writer(f)
csvwriter.writerows(datas)
python爬虫