爬取到的数据应该怎么整理?我想要其中 content 字段的数据,但是写了提取 content 的代码之后就爬取不到了。
import requests
import re
import json
# Every "content" string collected by main(), in the order it was found.
data_list = []

# GraphQL endpoint that the request is POSTed to.
URL = 'https://read.douban.com/j/graphql'

# Request headers, including the CSRF token that goes with the cookies below.
# NOTE(review): "X-CSRF-Token" holds the same value as the "ck" cookie in
# COOKIES; presumably both are tied to one login session and must be
# refreshed together -- confirm before relying on them.
HEADERS = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://read.douban.com",
    "Referer": "https://read.douban.com/ebook/329493032/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
    "X-CSRF-Token": "HrwZ",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"104\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
}

# The GraphQL document for the "getWorksComment" operation.  It is written as
# adjacent string literals, which Python joins into one single string; the
# text itself is unchanged.
QUERY = (
    "\n query getWorksComment($worksId: ID!, $limit: Int) {\n works: works(worksId: $worksId) {\n worksType\n \n ... on WorksBase {\n comments: mixedComments(limit: $limit) {\n \n ... on CommentBase {\n id\n isHidden\n isDeleted\n \n ... on CommentBase {\n id\n works {\n agent {\n id\n }\n }\n user {\n id\n avatar: picture(size: MEDIUM)\n name\n url\n isVip\n ... on Agent {\n agentName\n hasMedal\n agentId\n }\n }\n createTime\n commentType\n donation {\n amount\n }\n ... on Review {\n url\n badge {\n url\n image\n title\n color\n }\n }\n ... on Annotation {\n url\n }\n ... on WorksRecommend {\n score\n isEditorChoice\n }\n hasPurchasedAllBadge\n }\n\n \n ... on CommentBase {\n id\n content\n commentType\n user {\n id\n ... on Agent {\n agentName\n }\n }\n ... on Discussion {\n refComment {\n id\n user {\n id\n name\n url\n ... on Agent {\n agentName\n agentId\n }\n }\n isDeleted\n createTime\n content\n }\n }\n ... on Review {\n title\n badge {\n label\n color\n }\n }\n ... on Annotation {\n \n ... on Annotation {\n originContent {\n rawTexts\n startOffset\n endOffset\n image {\n url\n size { width height }\n }\n }\n }\n\n }\n ... on WorksRecommend {\n title\n }\n ... on ReviewComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }\n ... on AnnotationComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }\n ... on WorksRecommendComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }\n }\n\n \n ... on CommentBase {\n id\n commentType\n isHidden\n isDeleted\n content\n user {\n id\n name\n isBlocked\n ... on Agent {\n agentName\n }\n }\n works {\n id\n title\n cover(useSmall: true)\n url\n isChapter\n }\n operationInfo {\n editor {\n id\n name\n url\n }\n time\n }\n ... "
    "on Review {\n url\n upvoted\n upvoteCount\n commentCount\n }\n ... on Annotation {\n url\n upvoted\n upvoteCount\n commentCount\n }\n ... on Discussion {\n targetId\n upvoted\n upvoteCount\n works {\n title\n url\n }\n }\n ... on WorksRecommend {\n url\n upvoted\n upvoteCount\n commentCount\n }\n ... on ReviewComment {\n targetId\n upvoted\n upvoteCount\n }\n ... on AnnotationComment {\n targetId\n upvoted\n upvoteCount\n }\n ... on WorksRecommendComment {\n targetId\n upvoted\n upvoteCount\n }\n }\n\n \n ... on WorksRecommend {\n id\n works {\n id\n \n title\n cover(useSmall: true)\n url\n isBundle\n coverLabel(preferVip: true)\n \n \n url\n title\n\n \n author {\n name\n url\n }\n origAuthor {\n name\n url\n }\n translator {\n name\n url\n }\n\n isColumn\n isFinished\n wordCount\n wordCountUnit\n isInLibrary\n \n }\n }\n\n \n ... on CommentBase {\n id\n commentType\n content\n works {\n id\n title\n }\n user {\n id\n name\n isBlocked\n ... on Agent {\n agentName\n }\n }\n isHidden\n isDeleted\n operationInfo {\n editor {\n id\n name\n url\n }\n time\n }\n ... on Review {\n title\n rating\n reviewId\n upvoted\n upvoteCount\n commentCount\n }\n ... on Annotation {\n upvoted\n upvoteCount\n commentCount\n }\n ... on WorksRecommend {\n upvoted\n upvoteCount\n commentCount\n }\n ... on Discussion {\n targetId\n }\n ... on ReviewComment {\n targetId\n }\n ... on AnnotationComment {\n targetId\n }\n ... on WorksRecommendComment {\n targetId\n }\n }\n \n ... on CommentBase {\n id\n commentType\n content\n works {\n id\n }\n user {\n id\n name\n ... on Agent {\n agentName\n }\n }\n ... on Review {\n reviewId\n }\n }\n\n \n ... on CommentBase {\n id\n commentType\n works {\n id\n }\n ... on Review {\n reviewId\n }\n ... on Discussion {\n targetId\n }\n ... on ReviewComment {\n targetId\n }\n ... on AnnotationComment {\n targetId\n }\n ... on WorksRecommendComment {\n targetId\n }\n }\n\n\n }\n\n }\n commentTotal: mixedCommentCount\n }\n \n ... "
    "on WorksBase {\n id\n title\n review {\n id\n content\n ... on Review {\n reviewId\n rating\n title\n url\n }\n }\n }\n\n \n }\n }\n "
)

# JSON body of the POST: the query, its variables and the operation name.
PAYLOAD = {
    "query": QUERY,
    "variables": {
        "worksId": "329493032",
        "limit": 6,
    },
    "operationName": "getWorksComment",
}

# SECURITY(review): these look like cookies from a real logged-in browser
# session ("dbcl2", "ck", ...).  Hard-coding them publishes the session to
# anyone who can read this file; consider loading them from the environment
# or a private config file instead.
COOKIES = {
    "bid": "CtjGQQts6qw",
    "_ga": "GA1.3.854662765.1661781900",
    "_gid": "GA1.3.317441563.1661781904",
    "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
    "ll": "\"108288\"",
    "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
    "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "dbcl2": "\"262046614:rlftVkO/IYI\"",
    "ck": "HrwZ",
    "_pk_ses.100001.a7dd": "*",
    "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1661911407:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
    "_gat": "1",
    "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.7.1661911810.1661865165.",
    "_ga_RXNMP372GL": "GS1.1.1661911094.10.1.1661911816.50.0.0",
}


def iter_contents(node):
    """Yield every string stored under a ``"content"`` key inside *node*.

    *node* is decoded JSON (nested dicts and lists).  The walk is depth-first
    and follows dict/list order, so values come out in document order.
    ``"content"`` values that are not strings (for example ``None``) are
    skipped; if such a value is itself a dict or list it is still searched.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "content" and isinstance(value, str):
                yield value
            else:
                yield from iter_contents(value)
    elif isinstance(node, list):
        for item in node:
            yield from iter_contents(item)


def extract_contents(payload):
    """Return a list of all ``"content"`` strings found anywhere in *payload*.

    This replaces the original regular-expression search: the response is
    JSON, so it is parsed and walked instead of pattern-matched.  Because the
    query asks for ``content`` on comments, on quoted ``refComment`` objects
    and on ``review``, all of those can appear in the result.

    Args:
        payload: decoded JSON of any shape; non-container values give ``[]``.

    Returns:
        The content strings in document order (possibly empty).
    """
    return list(iter_contents(payload))


def fetch_comments(timeout=10):
    """POST the GraphQL query and return the decoded JSON response.

    Args:
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.post(
        URL,
        headers=HEADERS,
        cookies=COOKIES,
        data=json.dumps(PAYLOAD),
        timeout=timeout,
    )
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.json()


def main():
    """Fetch the comments, print each content string and store it in data_list."""
    # NOTE: the query has no paging variable (only "limit"), so more than one
    # iteration would repeat the same request; the range is kept as it was.
    for _page in range(1, 2, 1):
        # Try to fetch the data (it is taken from the JSON response).
        try:
            payload = fetch_comments()
        except (requests.RequestException, ValueError) as exc:
            # Only request/decoding failures are reported as a failed page;
            # programming errors are no longer hidden by a bare except.
            print("本页爬取失败", exc)
            continue

        # A GraphQL server can report failures in an "errors" entry of the body.
        if isinstance(payload, dict) and payload.get("errors"):
            print("本页爬取失败", payload["errors"])

        contents = extract_contents(payload)
        data_list.extend(contents)
        for text in contents:
            print(text)


if __name__ == "__main__":
    main()