weixin_63967673 2022-08-31 11:54 采纳率: 94.1%
浏览 31
已结题

要content的数据,但是写了会爬不到

爬取的数据怎么进行数据整理?我想要content的数据,但是写了会爬不到


import requests
import re
import json
data_list = []
for i in range(1,2,1):
    url = 'https://read.douban.com/j/graphql'
    #访问的头文件,还带这个cookie
    headers = {
        "Accept": "application/json",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Origin": "https://read.douban.com",
        "Referer": "https://read.douban.com/ebook/329493032/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
        "X-CSRF-Token": "HrwZ",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"104\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\""
    }
    data = {
        "query": "\n    query getWorksComment($worksId: ID!, $limit: Int) {\n      works: works(worksId: $worksId) {\n        worksType\n        \n    ... on WorksBase {\n      comments: mixedComments(limit: $limit) {\n        \n  ... on CommentBase {\n    id\n    isHidden\n    isDeleted\n    \n  ... on CommentBase {\n    id\n    works {\n      agent {\n        id\n      }\n    }\n    user {\n      id\n      avatar: picture(size: MEDIUM)\n      name\n      url\n      isVip\n      ... on Agent {\n        agentName\n        hasMedal\n        agentId\n      }\n    }\n    createTime\n    commentType\n    donation {\n      amount\n    }\n    ... on Review {\n      url\n      badge {\n        url\n        image\n        title\n        color\n      }\n    }\n    ... on Annotation {\n      url\n    }\n    ... on WorksRecommend {\n      score\n      isEditorChoice\n    }\n    hasPurchasedAllBadge\n  }\n\n    \n  ... on CommentBase {\n    id\n    content\n    commentType\n    user {\n      id\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Discussion {\n      refComment {\n        id\n        user {\n          id\n          name\n          url\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n        isDeleted\n        createTime\n        content\n      }\n    }\n    ... on Review {\n      title\n      badge {\n        label\n        color\n      }\n    }\n    ... on Annotation {\n      \n  ... on Annotation {\n    originContent {\n      rawTexts\n      startOffset\n      endOffset\n      image {\n        url\n        size { width height }\n      }\n    }\n  }\n\n    }\n    ... on WorksRecommend {\n      title\n    }\n    ... on ReviewComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on AnnotationComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n    ... on WorksRecommendComment {\n      refComment {\n        id\n        content\n        createTime\n        isDeleted\n        user {\n          id\n          name\n          ... on Agent {\n            agentName\n            agentId\n          }\n        }\n      }\n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    isHidden\n    isDeleted\n    content\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    works {\n      id\n      title\n      cover(useSmall: true)\n      url\n      isChapter\n    }\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n      upvoted\n      upvoteCount\n      works {\n        title\n        url\n      }\n    }\n    ... on WorksRecommend {\n      url\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on ReviewComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on AnnotationComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n    ... on WorksRecommendComment {\n      targetId\n      upvoted\n      upvoteCount\n    }\n  }\n\n    \n  ... on WorksRecommend {\n    id\n    works {\n      id\n      \n    title\n    cover(useSmall: true)\n    url\n    isBundle\n    coverLabel(preferVip: true)\n  \n      \n  url\n  title\n\n      \n  author {\n    name\n    url\n  }\n  origAuthor {\n    name\n    url\n  }\n  translator {\n    name\n    url\n  }\n\n      isColumn\n      isFinished\n      wordCount\n      wordCountUnit\n      isInLibrary\n      \n    }\n  }\n\n    \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n      title\n    }\n    user {\n      id\n      name\n      isBlocked\n      ... on Agent {\n        agentName\n      }\n    }\n    isHidden\n    isDeleted\n    operationInfo {\n      editor {\n        id\n        name\n        url\n      }\n      time\n    }\n    ... on Review {\n      title\n      rating\n      reviewId\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Annotation {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on WorksRecommend {\n      upvoted\n      upvoteCount\n      commentCount\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n  \n  ... on CommentBase {\n    id\n    commentType\n    content\n    works {\n      id\n    }\n    user {\n      id\n      name\n      ... on Agent {\n        agentName\n      }\n    }\n    ... on Review {\n      reviewId\n    }\n  }\n\n  \n  ... on CommentBase {\n    id\n    commentType\n    works {\n      id\n    }\n    ... on Review {\n      reviewId\n    }\n    ... on Discussion {\n      targetId\n    }\n    ... on ReviewComment {\n      targetId\n    }\n    ... on AnnotationComment {\n      targetId\n    }\n    ... on WorksRecommendComment {\n      targetId\n    }\n  }\n\n\n  }\n\n      }\n      commentTotal: mixedCommentCount\n    }\n    \n  ... on WorksBase {\n    id\n    title\n    review {\n      id\n      content\n      ... on Review {\n        reviewId\n        rating\n        title\n        url\n      }\n    }\n  }\n\n  \n      }\n    }\n  ",
        "variables": {
            "worksId": "329493032",
            "limit": 6
        },
        "operationName": "getWorksComment"
    }
    cookies = {
        "bid": "CtjGQQts6qw",
        "_ga": "GA1.3.854662765.1661781900",
        "_gid": "GA1.3.317441563.1661781904",
        "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
        "ll": "\"108288\"",
        "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
        "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        "dbcl2": "\"262046614:rlftVkO/IYI\"",
        "ck": "HrwZ",
        "_pk_ses.100001.a7dd": "*",
        "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1661911407:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
        "_gat": "1",
        "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.7.1661911810.1661865165.",
        "_ga_RXNMP372GL": "GS1.1.1661911094.10.1.1661911816.50.0.0"
    }
    #尝试获取数据(这里的数据应该是从json里面获取的)
    try:
        data = json.dumps(data)
        response = requests.post(url, headers=headers,cookies=cookies ,data=data)
        content = re.compile('"content":(.*?)').findall(response)
        print(content.text)
    except:
        print("本页爬取失败")

  • 写回答

2条回答 默认 最新

  • 亖夕 Python领域新星创作者 2022-08-31 12:25
    关注

    正则写错了

    img

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(1条)

报告相同问题?

问题事件

  • 系统已结题 9月8日
  • 已采纳回答 8月31日
  • 创建了问题 8月31日

悬赏问题

  • ¥15 下图接收小电路,谁知道原理
  • ¥15 装 pytorch 的时候出了好多问题,遇到这种情况怎么处理?
  • ¥20 IOS游览器某宝手机网页版自动立即购买JavaScript脚本
  • ¥15 手机接入宽带网线,如何释放宽带全部速度
  • ¥30 关于#r语言#的问题:如何对R语言中mfgarch包中构建的garch-midas模型进行样本内长期波动率预测和样本外长期波动率预测
  • ¥15 ETLCloud 处理json多层级问题
  • ¥15 matlab中使用gurobi时报错
  • ¥15 这个主板怎么能扩出一两个sata口
  • ¥15 不是,这到底错哪儿了😭
  • ¥15 2020长安杯与连接网探