不良使 2022-03-28 17:09 采纳率: 100%
浏览 47
已结题

知乎的接口变了,提取的html网页打开没有数据怎么解决

问题遇到的现象和发生背景

通过 nextUrl = json_data['paging']['next'] 返回的数据

https://www.zhihu.com/topics/20760895/feeds/essence_v4?offset=20&limit=10&include=data

打开这个地址后,返回的页面里没有数据

网页中实际的url

https://www.zhihu.com/api/v4/topics/20760895/feeds/essence?offset=20&limit=10&include=data

打开这个地址后有数据。对比可以发现,这两个网址并不相同

如果 nextUrl = json_data['paging']['next'] 取值正确,返回的应该是第二个地址,但实际上返回的是第一个地址。
也就是说取到的下一页地址不对,但按照 JSON 的结构,这样取值应该是没有问题的。

问题相关代码,请勿粘贴截图
# TODO                鸟欲高飞,必先展翅
# TODO                 向前的人 :Jhon
import pymysql  # MySQL client driver
import requests  # HTTP client
import json  # JSON parsing
import threading  # multithreading, to speed up crawling

# Open the MySQL connection (local server, database "zhihu") at import time.
db = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="zhihu")
# Create the cursor object through which SQL statements are executed.
cursor = db.cursor()


# todo 创建数据库操作函数
def insertDB(id, title, url):
    """Insert one row into the ``questions`` table.

    Args:
        id: Question id.
        title: Question title.
        url: URL stored alongside the question.

    Any exception raised while executing or committing is caught: the
    transaction is rolled back and the exception is printed. Nothing is
    re-raised to the caller.
    """
    # Let the driver bind the values instead of interpolating them into the
    # SQL text. Titles are free text and may contain quotes, which would
    # break a hand-built statement and open it to SQL injection.
    sql = "insert into questions values(%s,%s,%s)"
    try:
        cursor.execute(sql, (id, title, url))
        db.commit()  # the row is only persisted once the transaction commits
    except Exception as e:
        db.rollback()  # discard the failed transaction
        print(e)


def saveArticleDB(id, title, vote, cmts, auth, url):
    """Insert one row into the ``article`` table.

    Args:
        id: Article id.
        title: Article title.
        vote: Up-vote count.
        cmts: Comment count.
        auth: Author name.
        url: Article URL.

    Any exception raised while executing or committing is caught: the
    transaction is rolled back and the exception is printed. Nothing is
    re-raised to the caller.
    """
    # Bind the values through the driver rather than formatting them into
    # the statement: titles and author names are free text and may contain
    # quotes, which would break (or inject into) string-built SQL.
    sql = "insert into article values(%s,%s,%s,%s,%s,%s)"
    try:
        cursor.execute(sql, (id, title, vote, cmts, auth, url))
        db.commit()
    except Exception as e:
        db.rollback()
        print(e)


def index_web_page(url):
    """Fetch *url* with browser-like headers and return the response body as text.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
        # NOTE(review): this is a hard-coded login session cookie. It should be
        # loaded from configuration rather than committed to source. The value
        # is kept unchanged here; adjacent literals are concatenated so the
        # string no longer breaks across a physical line.
        'cookie': (
            '_zap=bf590ffc-7f43-4bd3-81a9-276108aeca32; d_c0="AGAQ2IAynhSPTifTZlhG0aGx-xT5O1PWGFA=|1646993823"; _9755xjdesxxd_=32; YD00517437729195%3AWM_TID=oQufEoKCUqxEQUQQVFN7vj%2BRK954IGQP; _xsrf=jkPVzZ3V22iz9rIRU2PH6HsbNZxd3rtT; __snaker__id=eqk26FiFhZ35zYns; l_cap_id="MDIzYjc5OGQ0YzBkNGE5YzljYTQ2MmI4NzIxODFkN2M=|1647615417|c15eb705983f2776351272edc5ac3f37dc5f0249"; r_cap_id="NjJlNzBmMzFkMGI4NGQyZmJlOWEyZmM3NDM2NDliNzk=|1647615417|6440b2d6e9a6bd8d34e9c1bed02229a99f2de13b"; cap_id="ZmQyZTZhZTY0ZTdjNDAzMWE2YTczMDE5MDY0M2JiZjg=|1647615417|5c8acf67a0230bdea96a92f9a07ebefc9328445c"; q_c1=88d57b9097bf4b3b8b685aeaa3b0341e|1647615421000|1647615421000; captcha_session_v2=2|1:0|10:1647693634|18:captcha_session_v2|88:b2dwU25ENjJnQzV1YU5RdVNtelR2QUpNVDlUT05sUk5BbFhIRU9LWVdRK3M4Sm5WWkFuWGt6ZkhNWWppdzA1WA==|c431f279025ef554aeecfa938eb9da48c4f9e59251f7db9eac42cb1695a767db; gdxidpyhxdE=gD%5C3jD9CASMy0Rl9GUW7LDiqaZBQMol1r0%2F9Xx2v62jE6Q1yZOiZEU%5CoHJJ26OosJN8MBtALgPqhdSSpWIHAeaU1l1xpzhdNW37x1tgfybr98vA2BbLM%5CxnK%2F8uYsNAu20RoGXx%5CRzyqyHPbIyx8rfiYd%2BcUQ0eNhXL%5Cj76NskHwtdCg%3A1647694536428; YD00517437729195%3AWM_NI=9HvJ3O5NnTXRTVXX1nY1pB0TF9WCZ4X67w1UIEXeiBwAvITVoNIjzFrlfH2R5ZmL%2BF7B3OieTAVRC51DUjRG1xxPb37lsSVQeuOYH9erFFC7TOK8Bh2M6S9kY8niITAaZDI%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee9bb352aab3bea6dc5ca9928fb3d54a879f9faeaa7ef7a983a9e64195ebbe8ef42af0fea7c3b92afb98baabb245f792a2b5db5a988a98b9b4669befc0a3b36086e799a4d9409ceaf998b45f8b9b9886ec4ab59fbaaad75ead92878bf94eb2b4aaa9b15af786a3adaa4ef59efdb0ec6f989f9ab3c23ff39c9b8bd14df3a8b8b1e139f5b4bdb4d55ba7ab81bacb33a7ae8aa7c23d989185d9c164f293fab4bc619be8a28acc6083bf96a5cc37e2a3; '
            'captcha_ticket_v2=2|1:0|10:1647693657|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfV0pOV0kyOEowVGxKUzZjUk5adlJaaEtjTVB3cHRuR1pWa004R2Q4aHNiZkh3TmZ6ZDVDSGdqOUpsOGVqQy1tN2dHNGtQcjVDWEVtSko5dUFtSVRJX1MyLjR1V3NHVWhqTll5TGo0TGliZVZ1UWcyOUlGUE1HMWxxdG85c2FiYzdScVNLUWRESVkuUnVES1ZILVlHLWd1T1VsNTJEVGIteHd6dE8tYXhtYW5kV082U3ZOQmhKenVFbUN0eFlReEJ2aDhtc0JnV3VoUGNPRTdHWV91UEVMSUVUSFh3b2J2TS1rVTZiRGNNUWg2OUwueWRsTW9aZWRZd0VDdG03VmYxc0RfaFRORGNGMWkyRHRCc2psLUNiN1p3clA5dnVxVTBFQVZUQ0ktVUZZeGlkcG1kT3FfSFp6UmZsNnNoS1hFNVBDLV9SMGJKbmR3cllpWWFHbDhKNFBaQVFQQmFwMVVIYlk4RHd2c1hRSVJraFJUMjZvWThfZ1JzX0FHZzhRdHZBV0dZQlZDaVdEajdlOVJGN1BMU2p4LTh4THJCVTdLekhhQXhqUHZiOFRsTWNGeUpRSjguLlk0THV5cEZKT3BjOGZ1Zm9mQ2k5Wi5MWlFlOUhkRkprSkRCYVpnMlZMRVhYamZacGNXUF91OWlkMS1HX1g0VEdBSFliVFhJMyJ9|53c6f250f24fe7f385f94f9af6e1ca5f1e9e917fd328c603cf2de762c3ea74ab; z_c0=2|1:0|10:1647693675|4:z_c0|92:Mi4xM2h2b0ZRQUFBQUFBWUJEWWdES2VGQ1lBQUFCZ0FsVk5heDBqWXdEN3lBQWowN2NDdVhyblhTbUNPRHR2My04M2p3|762883be5c97e8c72970472db8a1f0b5cd2b1aa8d45cf93bb1bf648ab17002b2; NOT_UNREGISTER_WAITING=1; SESSIONID=WlASm6uFYXFdV7H8XeORE6GW4sThg7rFjIzMCyAfLAP; JOID=VF0SAUNrVdkzSYrXTG-YSKbFvERZPzudaiq_oA88Dp1SFbOcdhh1l1VEi9BDSVCDDPzrKj8jKtmBRtMTcn03C0k=; osd=V18cAUNoV9czSYnVQm-YS6TLvERaPTWdaim9rg88DZ9cFbOfdBZ1l1ZGhdBDSlKNDPzoKDEjKtqDSNMTcX85C0k=; tst=r; KLBRSID=d017ffedd50a8c265f0e648afe355952|1648437557|1648437102'
        ),
        'referer': 'https://www.zhihu.com/topic/20760895/hot',
    }
    # A timeout keeps a stalled connection from hanging the crawler thread.
    html = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of handing an error body to the parser.
    html.raise_for_status()
    # "Unicode" is not a codec name Python recognises; name the real one.
    html.encoding = "utf-8"
    return html.text


# 解析网页
def parse_json(html):
    """Parse one page of a topic feed, store its items, and return the next page URL.

    Each entry of ``data`` is dispatched on ``target.type``:

    * ``answer``   - the answer's parent question is stored via ``insertDB``;
    * ``article``  - the article is stored via ``saveArticleDB``;
    * ``question`` - the question itself is stored via ``insertDB``.

    Entries of any other type are ignored.

    Args:
        html: JSON text of a feed response. It must contain a ``data`` list
            and a ``paging`` object.

    Returns:
        The value of ``paging.next``, or ``None`` when the page has no items
        or ``paging.is_end`` is truthy, so that a ``while url:`` loop in the
        caller terminates.

    Raises:
        KeyError: If ``data``, ``paging`` or a required item field is missing.
        ValueError: If *html* is not valid JSON.
    """
    payload = json.loads(html)
    items = payload['data']
    paging = payload['paging']

    for item in items:
        target = item['target']
        kind = target['type']
        if kind == "answer":
            # An answer: record the question it belongs to.
            question = target['question']
            question_id = question['id']
            title = question['title']
            url = "http://api.zhihu.com/questions/" + str(question_id)
            print("回答:", question_id, title)
            insertDB(question_id, title, url)
        elif kind == "article":
            # A column article.
            article_id = target["id"]
            title = target["title"]
            url = target["url"]
            vote = target['voteup_count']
            cmts = target['comment_count']
            auth = target['author']['name']
            print("专栏:", article_id, title)
            saveArticleDB(article_id, title, vote, cmts, auth, url)
        elif kind in ("question", "questions"):
            # The request's include filter names this type "question"; the
            # old check for "questions" alone never matched. Both spellings
            # are accepted to stay backward compatible.
            question_id = target['id']
            title = target['title']
            url = "http://api.zhihu.com/questions/" + str(question_id)
            print("问题:", question_id, title)
            insertDB(question_id, title, url)

    # NOTE(review): 'is_end' is assumed to be the feed's last-page flag; it is
    # not shown in this file. A missing key is treated as "not the end".
    if not items or paging.get('is_end'):
        return None
    return paging.get('next')

# 讨论
def craw_1(id):
    """Crawl the topic's ``essence`` feed page by page.

    Starting at ``offset=10``, each page is fetched with ``index_web_page``
    and handed to ``parse_json``. The loop follows the URL that
    ``parse_json`` returns and stops when that value is falsy.

    Args:
        id: Topic id (string or integer).
    """
    # str() accepts integer ids as well, matching craw_2 and craw_3.
    url = "https://www.zhihu.com/api/v4/topics/" + str(id) + "/feeds/essence?offset=10&limit=10&include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,answer_type;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.paid_info;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;"
    while url:
        print(url)  # page being requested
        html = index_web_page(url)
        print(html)  # raw response, kept for debugging
        url = parse_json(html)
        print(url)  # next page reported by the feed
        print("讨论")


# 讨论
def craw_2(id):
    """Crawl the topic's ``top_activity`` feed page by page.

    Each page is downloaded with ``index_web_page`` and passed to
    ``parse_json``; the loop continues with the URL ``parse_json`` returns
    and ends when that value is falsy.

    Args:
        id: Topic id; converted with ``str()`` before being placed in the URL.
    """
    endpoint = "https://www.zhihu.com/api/v4/topics/" + str(id)
    query = "/feeds/top_activity?offset=10&limit=10&include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp;data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=article)].target.content,voteup_count,comment_count,voting,author.badge[?(type=best_answerer)].topics;data[?(target.type=topic_sticky_module)].target.data[?(target.type=people)].target.answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.annotation_detail,content,hermes_label,is_labeled,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,answer_type;data[?(target.type=answer)].target.author.badge[?(type=best_answerer)].topics;data[?(target.type=answer)].target.paid_info;data[?(target.type=article)].target.annotation_detail,content,hermes_label,is_labeled,author.badge[?(type=best_answerer)].topics;data[?(target.type=question)].target.annotation_detail,comment_count;"
    next_page = endpoint + query
    while next_page:
        # Fetch the current page, store its items, and move on to the next URL.
        next_page = parse_json(index_web_page(next_page))
        print("讨论")


# 精华
def craw_3(id):
    """Crawl the topic's ``top_question`` feed page by page.

    Starting at ``offset=20``, each page is fetched with ``index_web_page``
    and handed to ``parse_json``. The loop follows the URL that
    ``parse_json`` returns and stops when that value is falsy.

    Args:
        id: Topic id; converted with ``str()`` before being placed in the URL.
    """
    # NOTE(review): the query carries a bare "data[...]" fragment before
    # "include=" that does not look like a key=value pair. It is kept
    # unchanged; confirm whether it was meant to be part of "include".
    url = "https://www.zhihu.com/api/v4/topics/" + str(id) + "/feeds/top_question?offset=20&limit=10&data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.is_normal,comment_count,voteup_count,content,relevant_info,excerpt.author.badge[?(type=best_answerer)].topics&include=data[?(target.type=topic_sticky_module)].target.data[?(target.type=answer)].target.content,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp"
    while url:
        html = index_web_page(url)
        # Advance to the next page. The result used to be stored in an unused
        # variable, so `url` never changed and the same page was refetched
        # forever.
        url = parse_json(html)
        print("精华")


if __name__ == '__main__':
    # Topic to crawl.
    topic_id = "20760895"
    # Run the first crawler on its own thread.
    worker = threading.Thread(target=craw_1, args=(topic_id,))
    worker.start()
    # Direct, single-threaded alternatives kept for reference:
    # craw_1(topic_id)    # only eight items
    # craw_2(topic_id)
    

运行结果及报错内容
我的解答思路和尝试过的方法
我想要达到的结果
  • 写回答

0条回答 默认 最新

    报告相同问题?

    问题事件

    • 系统已结题 4月5日
    • 创建了问题 3月28日

    悬赏问题

    • ¥15 is not in the mmseg::model registry。报错,模型注册表找不到自定义模块。
    • ¥15 安装quartus II18.1时弹出此error,怎么解决?
    • ¥15 keil官网下载psn序列号在哪
    • ¥15 想用adb命令做一个通话软件,播放录音
    • ¥30 Pytorch深度学习服务器跑不通问题解决?
    • ¥15 部分客户订单定位有误的问题
    • ¥15 如何在maya程序中利用python编写领子和褶裥的模型的方法
    • ¥15 Bug traq 数据包 大概什么价
    • ¥15 在anaconda上pytorch和paddle paddle下载报错
    • ¥25 自动填写QQ腾讯文档收集表