问题遇到的现象和发生背景
通过 nextUrl = json_data['paging']['next'] 取到的下一页地址是:
https://www.zhihu.com/topics/20760895/feeds/essence_v4?offset=20&limit=10&include=data
点击打开后,页面里没有数据
网页中实际的url
https://www.zhihu.com/api/v4/topics/20760895/feeds/essence?offset=20&limit=10&include=data
点击打开后有数据,可以看到这两个网址并不一样
如果 nextUrl = json_data['paging']['next'] 取值正确,返回的应该是第二个地址,但实际上返回的是第一个地址。
也就是说取到的下一页地址不对,但是按照 JSON 的结构来看,这样取值应该是没有问题的
问题相关代码,请勿粘贴截图
# TODO 鸟欲高飞,必先展翅
# TODO 向前的人 :Jhon
import pymysql # 请求库
import requests # 解析库
import json # 数据保存库
import threading # 多线程,提升爬取速度
# Connection settings for the local MySQL server used by this script.
_DB_SETTINGS = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "root",
    "passwd": "123456",
    "db": "zhihu",
}
# Open the connection that the storage helpers in this file share.
db = pymysql.connect(**_DB_SETTINGS)
# One cursor created from that connection; the insert helpers execute through it.
cursor = db.cursor()
# todo 创建数据库操作函数
def insertDB(id, title, url):
    """Insert one row into the ``questions`` table.

    The three values are handed to ``cursor.execute`` as query parameters so
    the driver escapes them. Building the statement with ``%`` string
    formatting broke on any title containing a quote and allowed SQL
    injection from crawled text.

    Args:
        id: Value for the first column (the question id).
        title: Value for the second column (the question title).
        url: Value for the third column (the question URL).

    Errors are not raised to the caller: the transaction is rolled back and
    the exception is printed.
    """
    # Placeholders only; the driver substitutes and escapes the values.
    sql = "insert into questions values(%s,%s,%s)"
    try:
        cursor.execute(sql, (id, title, url))
        db.commit()  # the row is only persisted once the transaction commits
    except Exception as e:  # deliberate best-effort: report and keep crawling
        db.rollback()
        print(e)
def saveArticleDB(id, title, vote, cmts, auth, url):
    """Insert one row into the ``article`` table.

    All six values are passed to ``cursor.execute`` as query parameters so
    the driver escapes them. Building the statement with ``%`` string
    formatting broke on titles or author names containing a quote and
    allowed SQL injection from crawled text.

    Args:
        id: Value for the first column (the article id).
        title: Value for the second column (the article title).
        vote: Value for the third column (the up-vote count).
        cmts: Value for the fourth column (the comment count).
        auth: Value for the fifth column (the author name).
        url: Value for the sixth column (the article URL).

    Errors are not raised to the caller: the transaction is rolled back and
    the exception is printed.
    """
    # Placeholders only; the driver substitutes and escapes the values.
    sql = "insert into article values(%s,%s,%s,%s,%s,%s)"
    try:
        cursor.execute(sql, (id, title, vote, cmts, auth, url))
        db.commit()  # the row is only persisted once the transaction commits
    except Exception as e:  # deliberate best-effort: report and keep crawling
        db.rollback()
        print(e)
def index_web_page(url, timeout=10):
    """Fetch ``url`` with browser-like headers and return the response text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the crawl
            forever.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
        # SECURITY NOTE(review): this is a hard-coded login session cookie.
        # It is kept so the request is unchanged, but it should be loaded
        # from configuration instead of living in source control.
        # The value is one string, written as adjacent literals (one cookie
        # per line) because a quoted string cannot span physical lines.
        'cookie': (
            '_zap=bf590ffc-7f43-4bd3-81a9-276108aeca32; '
            'd_c0="AGAQ2IAynhSPTifTZlhG0aGx-xT5O1PWGFA=|1646993823"; '
            '_9755xjdesxxd_=32; '
            'YD00517437729195%3AWM_TID=oQufEoKCUqxEQUQQVFN7vj%2BRK954IGQP; '
            '_xsrf=jkPVzZ3V22iz9rIRU2PH6HsbNZxd3rtT; '
            '__snaker__id=eqk26FiFhZ35zYns; '
            'l_cap_id="MDIzYjc5OGQ0YzBkNGE5YzljYTQ2MmI4NzIxODFkN2M=|1647615417|c15eb705983f2776351272edc5ac3f37dc5f0249"; '
            'r_cap_id="NjJlNzBmMzFkMGI4NGQyZmJlOWEyZmM3NDM2NDliNzk=|1647615417|6440b2d6e9a6bd8d34e9c1bed02229a99f2de13b"; '
            'cap_id="ZmQyZTZhZTY0ZTdjNDAzMWE2YTczMDE5MDY0M2JiZjg=|1647615417|5c8acf67a0230bdea96a92f9a07ebefc9328445c"; '
            'q_c1=88d57b9097bf4b3b8b685aeaa3b0341e|1647615421000|1647615421000; '
            'captcha_session_v2=2|1:0|10:1647693634|18:captcha_session_v2|88:b2dwU25ENjJnQzV1YU5RdVNtelR2QUpNVDlUT05sUk5BbFhIRU9LWVdRK3M4Sm5WWkFuWGt6ZkhNWWppdzA1WA==|c431f279025ef554aeecfa938eb9da48c4f9e59251f7db9eac42cb1695a767db; '
            'gdxidpyhxdE=gD%5C3jD9CASMy0Rl9GUW7LDiqaZBQMol1r0%2F9Xx2v62jE6Q1yZOiZEU%5CoHJJ26OosJN8MBtALgPqhdSSpWIHAeaU1l1xpzhdNW37x1tgfybr98vA2BbLM%5CxnK%2F8uYsNAu20RoGXx%5CRzyqyHPbIyx8rfiYd%2BcUQ0eNhXL%5Cj76NskHwtdCg%3A1647694536428; '
            'YD00517437729195%3AWM_NI=9HvJ3O5NnTXRTVXX1nY1pB0TF9WCZ4X67w1UIEXeiBwAvITVoNIjzFrlfH2R5ZmL%2BF7B3OieTAVRC51DUjRG1xxPb37lsSVQeuOYH9erFFC7TOK8Bh2M6S9kY8niITAaZDI%3D; '
            'YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee9bb352aab3bea6dc5ca9928fb3d54a879f9faeaa7ef7a983a9e64195ebbe8ef42af0fea7c3b92afb98baabb245f792a2b5db5a988a98b9b4669befc0a3b36086e799a4d9409ceaf998b45f8b9b9886ec4ab59fbaaad75ead92878bf94eb2b4aaa9b15af786a3adaa4ef59efdb0ec6f989f9ab3c23ff39c9b8bd14df3a8b8b1e139f5b4bdb4d55ba7ab81bacb33a7ae8aa7c23d989185d9c164f293fab4bc619be8a28acc6083bf96a5cc37e2a3; '
            'captcha_ticket_v2=2|1:0|10:1647693657|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfV0pOV0kyOEowVGxKUzZjUk5adlJaaEtjTVB3cHRuR1pWa004R2Q4aHNiZkh3TmZ6ZDVDSGdqOUpsOGVqQy1tN2dHNGtQcjVDWEVtSko5dUFtSVRJX1MyLjR1V3NHVWhqTll5TGo0TGliZVZ1UWcyOUlGUE1HMWxxdG85c2FiYzdScVNLUWRESVkuUnVES1ZILVlHLWd1T1VsNTJEVGIteHd6dE8tYXhtYW5kV082U3ZOQmhKenVFbUN0eFlReEJ2aDhtc0JnV3VoUGNPRTdHWV91UEVMSUVUSFh3b2J2TS1rVTZiRGNNUWg2OUwueWRsTW9aZWRZd0VDdG03VmYxc0RfaFRORGNGMWkyRHRCc2psLUNiN1p3clA5dnVxVTBFQVZUQ0ktVUZZeGlkcG1kT3FfSFp6UmZsNnNoS1hFNVBDLV9SMGJKbmR3cllpWWFHbDhKNFBaQVFQQmFwMVVIYlk4RHd2c1hRSVJraFJUMjZvWThfZ1JzX0FHZzhRdHZBV0dZQlZDaVdEajdlOVJGN1BMU2p4LTh4THJCVTdLekhhQXhqUHZiOFRsTWNGeUpRSjguLlk0THV5cEZKT3BjOGZ1Zm9mQ2k5Wi5MWlFlOUhkRkprSkRCYVpnMlZMRVhYamZacGNXUF91OWlkMS1HX1g0VEdBSFliVFhJMyJ9|53c6f250f24fe7f385f94f9af6e1ca5f1e9e917fd328c603cf2de762c3ea74ab; '
            'z_c0=2|1:0|10:1647693675|4:z_c0|92:Mi4xM2h2b0ZRQUFBQUFBWUJEWWdES2VGQ1lBQUFCZ0FsVk5heDBqWXdEN3lBQWowN2NDdVhyblhTbUNPRHR2My04M2p3|762883be5c97e8c72970472db8a1f0b5cd2b1aa8d45cf93bb1bf648ab17002b2; '
            'NOT_UNREGISTER_WAITING=1; '
            'SESSIONID=WlASm6uFYXFdV7H8XeORE6GW4sThg7rFjIzMCyAfLAP; '
            'JOID=VF0SAUNrVdkzSYrXTG-YSKbFvERZPzudaiq_oA88Dp1SFbOcdhh1l1VEi9BDSVCDDPzrKj8jKtmBRtMTcn03C0k=; '
            'osd=V18cAUNoV9czSYnVQm-YS6TLvERaPTWdaim9rg88DZ9cFbOfdBZ1l1ZGhdBDSlKNDPzoKDEjKtqDSNMTcX85C0k=; '
            'tst=r; '
            'KLBRSID=d017ffedd50a8c265f0e648afe355952|1648437557|1648437102'
        ),
        'referer': 'https://www.zhihu.com/topic/20760895/hot'
    }
    html = requests.get(url, headers=headers, timeout=timeout)
    # "Unicode" is not a codec name Python recognises, so the old assignment
    # could not select a real decoder. Name the intended one explicitly.
    # NOTE(review): assumes the API answers in UTF-8 (the usual JSON encoding).
    html.encoding = "utf-8"
    return html.text
# 解析网页
def parse_json(html):
    """Store every supported item of one feed page and return the next URL.

    Args:
        html: Response body of a topic-feed request, expected to be a JSON
            object with a ``data`` list and a ``paging`` object.

    Returns:
        The value of ``paging.next``, or ``None`` when crawling should stop:
        the body is not a JSON object, the page holds no items, or the
        ``paging.is_end`` flag is set.
    """
    try:
        json_data = json.loads(html)
    except ValueError as e:
        # An HTML error page or truncated body: report it and end the crawl
        # instead of crashing the worker.
        print(e)
        return None
    if not isinstance(json_data, dict):
        return None

    # Error payloads carry neither key, so treat missing values as empty.
    items = json_data.get('data') or []
    paging = json_data.get('paging') or {}

    for item in items:
        target = item.get('target') or {}
        kind = target.get('type')
        if kind == "answer":
            # Answer: store the question it belongs to.
            question = target['question']
            question_id = question['id']
            title = question['title']
            url = "http://api.zhihu.com/questions/" + str(question_id)
            print("回答:", question_id, title)
            insertDB(question_id, title, url)
        elif kind == "article":
            # Column article.
            article_id = target["id"]
            title = target["title"]
            url = target["url"]
            vote = target['voteup_count']
            cmts = target['comment_count']
            auth = target['author']['name']
            print("专栏:", article_id, title)
            saveArticleDB(article_id, title, vote, cmts, auth, url)
        elif kind in ("question", "questions"):
            # Question. The feed request URLs in this file filter on
            # ``target.type=question`` (singular), so the old
            # ``"questions"``-only test never matched. Both are accepted.
            question_id = target['id']
            title = target['title']
            url = "http://api.zhihu.com/questions/" + str(question_id)
            print("问题:", question_id, title)
            insertDB(question_id, title, url)

    # Stop on an empty page or when the server marks the last page.
    # Otherwise a caller looping on the returned URL never terminates.
    # NOTE(review): ``is_end`` is the end-of-list flag expected in the paging
    # object. It is read with .get, so a missing flag means "not ended".
    if not items or paging.get('is_end'):
        return None
    return paging.get('next')
# 讨论
# Field selector sent as ``include=`` by the essence and top_activity requests.
_FEED_INCLUDE = (
    "data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;"
    "data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;"
    "data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;"
    "data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;"
    "data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,answer_type;"
    "data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;"
    "data[?(target.type=answer)].target.paid_info;"
    "data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;"
    "data[?(target.type=question)].target.annotation_detail,comment_count;"
)


def _rebase_next_url(current_url, next_url):
    """Return ``next_url``'s query attached to ``current_url``'s scheme, host and path.

    The ``paging.next`` link from the server has been observed to point at a
    non-API path (``/topics/<id>/feeds/...`` instead of
    ``/api/v4/topics/<id>/feeds/...``) that returns no data. Only its query
    string (offset, limit, include) is trusted, and the path that just
    worked is reused. A falsy ``next_url`` yields ``None``.
    """
    from urllib.parse import urlsplit, urlunsplit

    if not next_url:
        return None
    query = urlsplit(next_url).query
    if not query:
        # Nothing to transplant; fall back to the link as given.
        return next_url
    current = urlsplit(current_url)
    return urlunsplit((current.scheme, current.netloc, current.path, query, ""))


def _crawl_feed(url):
    """Fetch and store feed pages from ``url`` until no further page is reported."""
    while url:
        print(url)  # progress: the page being requested
        html = index_web_page(url)
        next_url = _rebase_next_url(url, parse_json(html))
        if next_url == url:
            # The server handed back the page we just fetched; stop rather
            # than request it forever.
            break
        url = next_url


def craw_1(id):
    """Crawl the ``essence`` feed of topic ``id`` and store every page.

    NOTE(review): the request starts at ``offset=10``, so the first ten items
    are skipped. Confirm that this is intended.
    """
    _crawl_feed(
        "https://www.zhihu.com/api/v4/topics/" + str(id)
        + "/feeds/essence?offset=10&limit=10&include=" + _FEED_INCLUDE
    )
    print("讨论")


def craw_2(id):
    """Crawl the ``top_activity`` feed of topic ``id`` and store every page.

    NOTE(review): the request starts at ``offset=10``, so the first ten items
    are skipped. Confirm that this is intended.
    """
    _crawl_feed(
        "https://www.zhihu.com/api/v4/topics/" + str(id)
        + "/feeds/top_activity?offset=10&limit=10&include=" + _FEED_INCLUDE
    )
    print("讨论")


def craw_3(id):
    """Crawl the ``top_question`` feed of topic ``id`` and store every page.

    The loop previously assigned the next-page URL to an unused variable, so
    ``url`` never changed and the same page was fetched forever.

    NOTE(review): the request starts at ``offset=20``. Confirm that skipping
    the first twenty items is intended.
    """
    _crawl_feed(
        "https://www.zhihu.com/api/v4/topics/" + str(id)
        + "/feeds/top_question?offset=20&limit=10"
        "&data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics"
        "&include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp"
    )
    print("精华")
if __name__ == '__main__':
    id = "20760895"  # topic id placed into the feed URLs by the craw_* functions
    # Run the craw_1 crawl on a worker thread.
    t1 = threading.Thread(target=craw_1, args=(id,))
    t1.start()
    try:
        # Wait for the crawl so the shared connection is released only after
        # the last insert has been attempted.
        t1.join()
    finally:
        # The cursor and connection were previously never closed.
        cursor.close()
        db.close()
    # Author's note on the disabled direct call ``craw_1(id)``: it produced
    # only eight items. ``craw_2(id)`` was likewise left disabled.