weixin_63967673 2022-09-14 23:27 采纳率: 94.1%
浏览 22
已结题

关于爬虫爬取页数的问题

翻页爬取,不单单只爬一页,但是这个只能爬一页,无论怎么改Referer的page都不行

import requests
import re
import json
import time
# Sends the `getWorksComment` GraphQL query to read.douban.com once per
# iteration (i = 1..19), extracts `content` values from the raw response text
# with a regex, and appends them to '测试写入.txt'.
#
# NOTE(review): the loop counter `i` is only used in the progress message and
# in the Referer header. The request body is the same on every iteration
# ("start": 20, "limit": 20), so each request asks for the same window.
for i in range(1,20,1):
 print("正在爬取" +str(i))
 # Request headers; "Referer" is the only value that changes with `i`.
 headers = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://read.douban.com",
    "Referer": "https://read.douban.com/ebook/57468782/comments?page=" +str(i),
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
    "X-CSRF-Token": "HrwZ",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"104\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\""
 }
 # Cookies sent with every request; "ck" carries the same value as the
 # "X-CSRF-Token" header above.
 cookies = {
     "bid": "CtjGQQts6qw",
     "_ga": "GA1.3.854662765.1661781900",
     "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
     "ll": "\"108288\"",
     "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
     "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
     "dbcl2": "\"262046614:rlftVkO/IYI\"",
     "ck": "HrwZ",
     "_pk_ses.100001.a7dd": "*",
     "_gid": "GA1.3.1915073618.1662102188",
     "_gat": "1",
     "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1662102190:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
     "_ga_RXNMP372GL": "GS1.1.1662102176.14.1.1662102202.34.0.0",
     "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.10.1662102203.1661928898."
 }
 url = "https://read.douban.com/j/graphql"
 # GraphQL request body: query text, its variables, and the operation name.
 data = {
     "query": "\n    query getWorksComment($worksId: ID!, $start: Int, $limit: Int, $sort: ReviewSortEnum, $onlyUserId: ID, $onlyCompetition: Boolean, $reviewType: String) {\n      works: works(worksId: $worksId) {\n        worksType\n        \n  ... on WorksBase {\n    mixedCommentCount\n  }\n\n\n  ... on WorksBase {\n    isCompetition\n  }\n\n\n    ... on WorksBase {\n      comments: reviews(\n        start: $start, limit: $limit, sort: $sort, onlyUserId: $onlyUserId, competitionOnly: $onlyCompetition, lengthType: $reviewType\n      ) {\n        list {\n          \n  ... on CommentBase {\n    id\n    isHidden\n    isDeleted\n    \n  ... on CommentBase {\n    id\n    works {\n      agent {\n        id\n      }\n      \n  title\n  url\n  isChapter\n\n    }\n    user {\n      id\n      avatar: picture(size: MEDIUM)\n      name\n      url\n      isVip\n      ... on Agent {\n        agentName\n        hasMedal\n        agentId\n      }\n    }\n    createTime\n    commentType\n    donation {\n      amount\n    }\n    ... on Review {\n      url\n      badge {\n        url\n        image\n        title\n        color\n      }\n    }\n    ... on Annotation {\n      url\n    }\n    ... on WorksRecommend {\n      score\n      isEditorChoice\n    }\n    hasPurchasedAllBadge\n  }\n\n    \n  ... on CommentBase {\n    id\n    content\n    commentType\n    user {\n      id\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Discussion {\n      refComment {\n        id\n        user {\n          id\n          name\n          url\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n        isDeleted\n        createTime\n        content\n      }\n    }\n    ... on Review {\n      title\n      badge {\n        label\n        color\n      }\n    }\n    ... on Annotation {\n      \n  ... 
on Annotation {\n    originContent {\n      rawTexts\n      startOffset\n      endOffset\n      image {\n        url\n        size { width height }\n      }\n    }\n  }\n\n    }\n    ... on WorksRecommend {\n      title\n    }\n    ... on ReviewComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on AnnotationComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on WorksRecommendComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    isHidden\n    isDeleted\n    content\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    works {\n      id\n      cover(useSmall: true)\n      \n  title\n  url\n  isChapter\n\n    }\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n      upvoted\n      upvoteCount\n      works {\n        title\n        url\n      }\n    }\n    ... on WorksRecommend {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on ReviewComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... 
on AnnotationComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on WorksRecommendComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n  }\n\n    \n  ... on WorksRecommend {\n    id\n    works {\n      id\n      \n    title\n    cover(useSmall: true)\n    url\n    isBundle\n    coverLabel(preferVip: true)\n  \n      \n  url\n  title\n\n      \n  author {\n    name\n    url\n  }\n  origAuthor {\n    name\n    url\n  }\n  translator {\n    name\n    url\n  }\n\n      isColumn\n      isFinished\n      wordCount\n      wordCountUnit\n      isInLibrary\n      \n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n      title\n    }\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    isHidden\n    isDeleted\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      title\n      rating\n      reviewId\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on WorksRecommend {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n  \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n    }\n    user {\n      id\n      name\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Review {\n      reviewId\n    }\n  }\n\n  \n  ... on CommentBase {\n    id\n    commentType\n    works {\n      id\n    }\n    ... on Review {\n      reviewId\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... 
on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n\n\n  }\n\n        }\n        total\n      }\n    }\n  \n      }\n    }\n  ",
     "variables": {
         "worksId": "57468782",
         # NOTE(review): literal 20 on every iteration; `i` is not used here,
         # so the requested start/limit window never moves.
         "start": 20,
         "limit": 20,
         "sort": "SCORE_DESC",
         "onlyUserId": None,
         "onlyCompetition": False
     },
     "operationName": "getWorksComment"
 }
 try:
         # Serialise the dict to a JSON string; `data` now names the string.
         data = json.dumps(data)
         response = requests.post(url, headers=headers,cookies=cookies ,data=data)
         # Wait 5 seconds after each request.
         time.sleep(5)
         # Captures the text between `content":"` and the next `","` in the
         # raw response body (the response is not parsed as JSON).
         conzhen = 'content":"(.*?)","'
         content = re.compile(conzhen).findall(response.text)
 except:
         # Bare except: any error in the block above only prints this message.
         # `content` is then not reassigned, so the code below fails with a
         # NameError on the first iteration or reuses the previous iteration's
         # matches on later ones.
         print("本页爬取失败")
 # Append mode; no explicit encoding is passed to open().
 fh = open('测试写入.txt', 'a')
 # Rebinds the outer loop's name `i`; the index starts at 1, so content[0]
 # is never printed or written.
 for i in range(1 , len(content)):
     print(content[i])
     fh.write(content[i] + '\n')
 fh.close()


  • 写回答

2条回答 默认 最新

  • 梦里逆天 2022-09-15 00:04
    关注

    img

    import requests
    import re
    import json
    import time
    
    # Crawl the reader comments of one Douban Read work page by page through
    # the site's GraphQL endpoint and append every extracted comment body to
    # '测试写入.txt', one per line.
    #
    # Paging is driven by the `start`/`limit` variables of the GraphQL request;
    # the Referer header is sent along but does not select the page.

    WORKS_ID = "57468782"
    PAGE_SIZE = 20         # comments requested per call (GraphQL `limit`)
    PAGE_COUNT = 19        # pages 1..19
    PAUSE_SECONDS = 5      # delay after every request
    REQUEST_TIMEOUT = 30   # seconds; requests never times out unless told to

    url = "https://read.douban.com/j/graphql"
    referer_prefix = "https://read.douban.com/ebook/" + WORKS_ID + "/comments?page="

    # Headers shared by every request; only "Referer" is added per page.
    # NOTE(review): "X-CSRF-Token" equals the "ck" cookie below; both are
    # presumably tied to a logged-in browser session -- replace with your own.
    base_headers = {
        "Accept": "application/json",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Origin": "https://read.douban.com",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
        "X-CSRF-Token": "HrwZ",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": "Chromium;v=104, Not A;Brand;v=99, Microsoft Edge;v=104",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows"
    }
    cookies = {
        "bid": "CtjGQQts6qw",
        "_ga": "GA1.3.854662765.1661781900",
        "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
        "ll": "108288",
        "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
        "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        "dbcl2": "262046614:rlftVkO/IYI",
        "ck": "HrwZ",
        "_pk_ses.100001.a7dd": "*",
        "_gid": "GA1.3.1915073618.1662102188",
        "_gat": "1",
        "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1662102190:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
        "_ga_RXNMP372GL": "GS1.1.1662102176.14.1.1662102202.34.0.0",
        "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.10.1662102203.1661928898."
    }
    # GraphQL document, kept as a single string literal.
    query = "query getWorksComment($worksId: ID!, $start: Int, $limit: Int, $sort: ReviewSortEnum, $onlyUserId: ID, $onlyCompetition: Boolean, $reviewType: String) {\n      works: works(worksId: $worksId) {\n        worksType\n        \n  ... on WorksBase {\n    mixedCommentCount\n  }\n\n\n  ... on WorksBase {\n    isCompetition\n  }\n\n\n    ... on WorksBase {\n      comments: reviews(\n        start: $start, limit: $limit, sort: $sort, onlyUserId: $onlyUserId, competitionOnly: $onlyCompetition, lengthType: $reviewType\n      ) {\n        list {\n          \n  ... on CommentBase {\n    id\n    isHidden\n    isDeleted\n    \n  ... on CommentBase {\n    id\n    works {\n      agent {\n        id\n      }\n      \n  title\n  url\n  isChapter\n\n    }\n    user {\n      id\n      avatar: picture(size: MEDIUM)\n      name\n      url\n      isVip\n      ... on Agent {\n        agentName\n        hasMedal\n        agentId\n      }\n    }\n    createTime\n    commentType\n    donation {\n      amount\n    }\n    ... on Review {\n      url\n      badge {\n        url\n        image\n        title\n        color\n      }\n    }\n    ... on Annotation {\n      url\n    }\n    ... on WorksRecommend {\n      score\n      isEditorChoice\n    }\n    hasPurchasedAllBadge\n  }\n\n    \n  ... on CommentBase {\n    id\n    content\n    commentType\n    user {\n      id\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Discussion {\n      refComment {\n        id\n        user {\n          id\n          name\n          url\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n        isDeleted\n        createTime\n        content\n      }\n    }\n    ... on Review {\n      title\n      badge {\n        label\n        color\n      }\n    }\n    ... on Annotation {\n      \n  ... on Annotation {\n    originContent {\n      rawTexts\n      startOffset\n      endOffset\n      image {\n        url\n        size { width height }\n      }\n    }\n  }\n\n    }\n    ... on WorksRecommend {\n      title\n    }\n    ... on ReviewComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on AnnotationComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on WorksRecommendComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    isHidden\n    isDeleted\n    content\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    works {\n      id\n      cover(useSmall: true)\n      \n  title\n  url\n  isChapter\n\n    }\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n      upvoted\n      upvoteCount\n      works {\n        title\n        url\n      }\n    }\n    ... on WorksRecommend {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on ReviewComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on AnnotationComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on WorksRecommendComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n  }\n\n    \n  ... on WorksRecommend {\n    id\n    works {\n      id\n      \n    title\n    cover(useSmall: true)\n    url\n    isBundle\n    coverLabel(preferVip: true)\n  \n      \n  url\n  title\n\n      \n  author {\n    name\n    url\n  }\n  origAuthor {\n    name\n    url\n  }\n  translator {\n    name\n    url\n  }\n\n      isColumn\n      isFinished\n      wordCount\n      wordCountUnit\n      isInLibrary\n      \n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n      title\n    }\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    isHidden\n    isDeleted\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      title\n      rating\n      reviewId\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on WorksRecommend {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n  \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n    }\n    user {\n      id\n      name\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Review {\n      reviewId\n    }\n  }\n\n  \n  ... on CommentBase {\n    id\n    commentType\n    works {\n      id\n    }\n    ... on Review {\n      reviewId\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n\n\n  }\n\n        }\n        total\n      }\n    }\n  \n      }\n    }\n  "

    # Captures the text between `content":"` and the next `","` in the raw
    # response body. Compiled once instead of once per page.
    # NOTE(review): the captured text is still JSON-escaped; parsing the body
    # with response.json() would be more robust once the response shape has
    # been confirmed.
    content_pattern = re.compile('content":"(.*?)","')

    # `with` closes the file even if the loop raises; the explicit UTF-8
    # encoding keeps the write from failing under a locale codec that cannot
    # represent every character.
    with open('测试写入.txt', 'a', encoding='utf-8') as fh:
        for page in range(1, PAGE_COUNT + 1):
            print("正在爬取" + str(page))
            headers = {**base_headers, "Referer": referer_prefix + str(page)}
            payload = {
                "query": query,
                "variables": {
                    "worksId": WORKS_ID,
                    # Page 1 begins at offset 0. Assumes `start` is a
                    # zero-based offset, as the start/limit pair suggests;
                    # `20 * page` would skip the first 20 comments.
                    "start": PAGE_SIZE * (page - 1),
                    "limit": PAGE_SIZE,
                    "sort": "SCORE_DESC",
                    "onlyUserId": None,
                    "onlyCompetition": False
                },
                "operationName": "getWorksComment"
            }
            try:
                response = requests.post(
                    url,
                    headers=headers,
                    cookies=cookies,
                    data=json.dumps(payload),
                    timeout=REQUEST_TIMEOUT,
                )
                # Treat HTTP error statuses as a failed page as well.
                response.raise_for_status()
            except requests.RequestException:
                # Only network/HTTP failures are reported here; programming
                # and file errors are no longer hidden behind this message.
                print("本页爬取失败")
                continue
            finally:
                # Pause after every request, successful or not.
                time.sleep(PAUSE_SECONDS)

            comments = content_pattern.findall(response.text)
            print(len(comments))
            # Write every match; indexing from 1 dropped the first comment
            # of each page.
            for comment in comments:
                print(comment)
                fh.write(comment + '\n')
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

问题事件

  • 系统已结题 9月23日
  • 已采纳回答 9月15日
  • 创建了问题 9月14日

悬赏问题

  • ¥20 wireshark抓不到vlan
  • ¥20 关于#stm32#的问题:需要指导自动酸碱滴定仪的原理图程序代码及仿真
  • ¥20 设计一款异域新娘的视频相亲软件需要哪些技术支持
  • ¥15 stata安慰剂检验作图但是真实值不出现在图上
  • ¥15 c程序不知道为什么得不到结果
  • ¥40 复杂的限制性的商函数处理
  • ¥15 程序不包含适用于入口点的静态Main方法
  • ¥15 素材场景中光线烘焙后灯光失效
  • ¥15 请教一下各位,为什么我这个没有实现模拟点击
  • ¥15 执行 virtuoso 命令后,界面没有,cadence 启动不起来