使用 Python 爬取有关地铁的评价,程序运行后输出的文件却是空的。
import time
import csv
import codecs
import requests
from pyquery import PyQuery as pq
# Request headers.
# NOTE: the real values were redacted in the original post; the empty strings
# below are placeholders that must be filled in before running.
headers = {
    'user-agent': '',
    'referer': '',
    'cookie': '',
}
# Request URL.
# NOTE: also redacted. get_page() appends 'include=...&limit=...' directly to
# base_url, so base_url presumably has to end with '?' -- confirm when
# filling it in.
base_url = ''
include = ''
def get_page(offset):
    """Fetch one page of answers and return the decoded JSON body.

    Args:
        offset: Index of the first answer on the requested page.

    Returns:
        The parsed JSON response, or None when the request fails, the server
        answers with a non-200 status, or the body is not valid JSON. Every
        failure is printed so that an empty result is never silent.
    """
    page_url = ('include=' + include + '&limit=5&' + 'offset=' + str(offset)
                + '&platform=desktop&sort_by=default')
    url = base_url + page_url
    try:
        # A timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException also covers timeouts and malformed URLs, not
        # only connection errors.
        print('Error', e.args)
        return None
    if response.status_code != 200:
        # Report rejected requests instead of returning None without a trace.
        print('Error: unexpected HTTP status', response.status_code)
        return None
    try:
        return response.json()
    except ValueError as e:
        # A 200 response can still carry a non-JSON body (e.g. an HTML page).
        print('Error: response is not valid JSON', e.args)
        return None
def TimeStampToTime(timestamp):
    """Format a timestamp as 'YYYY-MM-DD HH:MM:SS' in local time.

    Args:
        timestamp: Seconds since the epoch, as accepted by time.localtime().

    Returns:
        The formatted date-time string.
    """
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))
def parse_page(json):
    """Yield one flat record per answer contained in a page of JSON.

    Args:
        json: Decoded response as returned by get_page(), or None.

    Yields:
        A dict with the keys '作者', 'user_token', '回答', '创建时间',
        '赞同数' and '评论数', in that order. Nothing is yielded when *json*
        is empty or has no 'data' entry.
    """
    if not json:
        return
    # 'data' holds all answers of one response; tolerate it being absent.
    for item in json.get('data') or []:
        author = item.get('author') or {}
        content = item.get('content')
        # The column is labelled "creation time", so prefer 'created_time';
        # fall back to 'updated_time' when the former is not present.
        created = item.get('created_time', item.get('updated_time'))
        yield {
            '作者': author.get('name'),
            'user_token': author.get('url_token'),
            # Strip the HTML markup; skip parsing when there is no content.
            '回答': pq(content).text() if content else '',
            # time.localtime(None) would silently yield the current time,
            # so leave the field empty when no timestamp is available.
            '创建时间': TimeStampToTime(created) if created is not None else '',
            '赞同数': item.get('voteup_count'),
            '评论数': item.get('comment_count'),
        }
if __name__ == '__main__':
    # Crawl page by page and write every answer to test.csv and test.txt.
    i = 0
    fieldnames = ['作者', 'user_token', '回答', '创建时间', '赞同数', '评论数']
    # 'with' guarantees both files are flushed and closed even if the crawl
    # aborts; newline='' lets the csv module control the line endings.
    with open('test.csv', 'w', encoding='utf_8_sig', newline='') as f, \
            open('test.txt', 'w', encoding='utf_8') as f_txt:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        while True:
            # The offset is the index of the first answer on the page.
            js = get_page(i * 5)
            if js is None:
                # A failed request used to make this loop spin forever with
                # nothing written; stop instead.
                print('Error: no data received, stopping at offset', i * 5)
                break
            for res in parse_page(js):
                writer.writerow(res)
                for detail in res.values():
                    f_txt.write(str(detail) + '\n')
                f_txt.write('\n' + '*' * 50 + '\n')  # separator
            paging = js.get('paging')
            if not paging:
                # Without paging info the end can never be detected.
                print('Error: response has no paging info, stopping at offset',
                      i * 5)
                break
            if paging.get('is_end'):
                print('finish!')
                break
            i += 1
为避免泄露隐私,省略了 headers 和 url 的具体内容。
是 get_page 函数有问题,还是网页解析有问题?应该如何改进?