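# Paginated crawl: the goal is to fetch every comment page, not just a single one.
# Known issue noted by the author: the crawl only ever returned one page, and
# changing the `page` value in the Referer header made no difference.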
import requests
import re
import json
import time
# Crawl the reader comments of one Douban Read e-book through the site's
# GraphQL endpoint, page by page, and append every comment text to a file.

# GraphQL endpoint that serves the comment list.
API_URL = "https://read.douban.com/j/graphql"
# Identifier of the e-book whose comments are crawled.
WORKS_ID = "57468782"
# Number of comments requested per page (the GraphQL `limit` variable).
PAGE_SIZE = 20
# 1-based page numbers to crawl.
PAGES = range(1, 20)
# Pause after every request, in seconds.
REQUEST_DELAY_SECONDS = 5
# Give up on a request that has not answered after this many seconds.
REQUEST_TIMEOUT_SECONDS = 30
# File the comment texts are appended to, one per line.
OUTPUT_PATH = '测试写入.txt'

# Headers sent with every request; the per-page Referer is added by
# build_headers().
BASE_HEADERS = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://read.douban.com",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63",
    # Same value as the `ck` cookie below.
    "X-CSRF-Token": "HrwZ",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"104\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
}

# NOTE(review): these look like session cookies captured from a logged-in
# browser; consider loading them from the environment instead of committing
# them to source control.
COOKIES = {
    "bid": "CtjGQQts6qw",
    "_ga": "GA1.3.854662765.1661781900",
    "__gads": "ID=d5118ecfd9b7e0af-2274d7276ad60026:T=1661830434:RT=1661830434:S=ALNI_Mal2pDexGkPbLkfBDTGaasIQBwdVg",
    "ll": "\"108288\"",
    "__utma": "30149280.854662765.1661781900.1661864911.1661864911.1",
    "__utmz": "30149280.1661864911.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "dbcl2": "\"262046614:rlftVkO/IYI\"",
    "ck": "HrwZ",
    "_pk_ses.100001.a7dd": "*",
    "_gid": "GA1.3.1915073618.1662102188",
    "_gat": "1",
    "__gpi": "UID=0000093d4aa6ff27:T=1661830434:RT=1662102190:S=ALNI_MaJaflHC8_SKOY1tp75x72g4tBu-g",
    "_ga_RXNMP372GL": "GS1.1.1662102176.14.1.1662102202.34.0.0",
    "_pk_id.100001.a7dd": "684c82dba8991abc.1661781900.10.1662102203.1661928898.",
}

# The `getWorksComment` GraphQL document. It is split into adjacent string
# literals purely for readability; the concatenated value is sent as-is.
COMMENTS_QUERY = (
    "\n query getWorksComment($worksId: ID!, $start: Int, $limit: Int, $sort: ReviewSortEnum, $onlyUserId: ID, $onlyCompetition: Boolean, $reviewType: String) {"
    "\n works: works(worksId: $worksId) {\n worksType\n \n ... on WorksBase {\n mixedCommentCount\n }"
    "\n\n\n ... on WorksBase {\n isCompetition\n }"
    "\n\n\n ... on WorksBase {\n comments: reviews("
    "\n start: $start, limit: $limit, sort: $sort, onlyUserId: $onlyUserId, competitionOnly: $onlyCompetition, lengthType: $reviewType"
    "\n ) {\n list {\n \n ... on CommentBase {\n id\n isHidden\n isDeleted"
    "\n \n ... on CommentBase {\n id\n works {\n agent {\n id\n }\n \n title\n url\n isChapter\n\n }"
    "\n user {\n id\n avatar: picture(size: MEDIUM)\n name\n url\n isVip\n ... on Agent {\n agentName\n hasMedal\n agentId\n }\n }"
    "\n createTime\n commentType\n donation {\n amount\n }"
    "\n ... on Review {\n url\n badge {\n url\n image\n title\n color\n }\n }"
    "\n ... on Annotation {\n url\n }\n ... on WorksRecommend {\n score\n isEditorChoice\n }\n hasPurchasedAllBadge\n }"
    "\n\n \n ... on CommentBase {\n id\n content\n commentType\n user {\n id\n ... on Agent {\n agentName\n }\n }"
    "\n ... on Discussion {\n refComment {\n id\n user {\n id\n name\n url\n ... on Agent {\n agentName\n agentId\n }\n }\n isDeleted\n createTime\n content\n }\n }"
    "\n ... on Review {\n title\n badge {\n label\n color\n }\n }"
    "\n ... on Annotation {\n \n ... on Annotation {\n originContent {\n rawTexts\n startOffset\n endOffset\n image {\n url\n size { width height }\n }\n }\n }\n\n }"
    "\n ... on WorksRecommend {\n title\n }"
    "\n ... on ReviewComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }"
    "\n ... on AnnotationComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }"
    "\n ... on WorksRecommendComment {\n refComment {\n id\n content\n createTime\n isDeleted\n user {\n id\n name\n ... on Agent {\n agentName\n agentId\n }\n }\n }\n }\n }"
    "\n\n \n ... on CommentBase {\n id\n commentType\n isHidden\n isDeleted\n content\n user {\n id\n name\n isBlocked\n ... on Agent {\n agentName\n }\n }"
    "\n works {\n id\n cover(useSmall: true)\n \n title\n url\n isChapter\n\n }"
    "\n operationInfo {\n editor {\n id\n name\n url\n }\n time\n }"
    "\n ... on Review {\n url\n upvoted\n upvoteCount\n commentCount\n }\n ... on Annotation {\n url\n upvoted\n upvoteCount\n commentCount\n }"
    "\n ... on Discussion {\n targetId\n upvoted\n upvoteCount\n works {\n title\n url\n }\n }"
    "\n ... on WorksRecommend {\n url\n upvoted\n upvoteCount\n commentCount\n }"
    "\n ... on ReviewComment {\n targetId\n upvoted\n upvoteCount\n }\n ... on AnnotationComment {\n targetId\n upvoted\n upvoteCount\n }\n ... on WorksRecommendComment {\n targetId\n upvoted\n upvoteCount\n }\n }"
    "\n\n \n ... on WorksRecommend {\n id\n works {\n id\n \n title\n cover(useSmall: true)\n url\n isBundle\n coverLabel(preferVip: true)"
    "\n \n \n url\n title\n\n \n author {\n name\n url\n }\n origAuthor {\n name\n url\n }\n translator {\n name\n url\n }"
    "\n\n isColumn\n isFinished\n wordCount\n wordCountUnit\n isInLibrary\n \n }\n }"
    "\n\n \n ... on CommentBase {\n id\n commentType\n content\n works {\n id\n title\n }\n user {\n id\n name\n isBlocked\n ... on Agent {\n agentName\n }\n }"
    "\n isHidden\n isDeleted\n operationInfo {\n editor {\n id\n name\n url\n }\n time\n }"
    "\n ... on Review {\n title\n rating\n reviewId\n upvoted\n upvoteCount\n commentCount\n }\n ... on Annotation {\n upvoted\n upvoteCount\n commentCount\n }\n ... on WorksRecommend {\n upvoted\n upvoteCount\n commentCount\n }"
    "\n ... on Discussion {\n targetId\n }\n ... on ReviewComment {\n targetId\n }\n ... on AnnotationComment {\n targetId\n }\n ... on WorksRecommendComment {\n targetId\n }\n }"
    "\n \n ... on CommentBase {\n id\n commentType\n content\n works {\n id\n }\n user {\n id\n name\n ... on Agent {\n agentName\n }\n }\n ... on Review {\n reviewId\n }\n }"
    "\n\n \n ... on CommentBase {\n id\n commentType\n works {\n id\n }\n ... on Review {\n reviewId\n }\n ... on Discussion {\n targetId\n }"
    "\n ... on ReviewComment {\n targetId\n }\n ... on AnnotationComment {\n targetId\n }\n ... on WorksRecommendComment {\n targetId\n }\n }"
    "\n\n\n }\n\n }\n total\n }\n }\n \n }\n }\n "
)


def build_headers(page):
    """Return the request headers for 1-based comment page *page*."""
    referer = "https://read.douban.com/ebook/57468782/comments?page=" + str(page)
    return {**BASE_HEADERS, "Referer": referer}


def build_payload(page):
    """Return the GraphQL request body (as a dict) for 1-based page *page*.

    The server paginates on the `start` offset, not on the Referer, so the
    offset is derived from the page number: page 1 -> 0, page 2 -> 20, ...
    """
    return {
        "query": COMMENTS_QUERY,
        "variables": {
            "worksId": WORKS_ID,
            "start": (page - 1) * PAGE_SIZE,
            "limit": PAGE_SIZE,
            "sort": "SCORE_DESC",
            "onlyUserId": None,
            "onlyCompetition": False,
        },
        "operationName": "getWorksComment",
    }


def _iter_content(node):
    """Yield every string stored under a "content" key, in document order."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "content" and isinstance(value, str):
                yield value
            else:
                yield from _iter_content(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_content(item)


def extract_comments(body):
    """Return all comment texts found in the JSON response text *body*.

    Every string value stored under a "content" key is collected, including
    the text of quoted comments nested under `refComment`. Parsing the JSON
    (rather than regex-scanning the raw text) decodes escapes such as `\\n`.

    Raises:
        ValueError: if *body* is not valid JSON.
    """
    return list(_iter_content(json.loads(body)))


def fetch_comments(page):
    """Request comment page *page* and return its comment texts.

    Raises:
        requests.RequestException: on network errors, timeouts, or a
            non-success HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        headers=build_headers(page),
        cookies=COOKIES,
        data=json.dumps(build_payload(page)),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return extract_comments(response.text)


def main():
    """Crawl every page in PAGES and append the comments to OUTPUT_PATH."""
    # Explicit UTF-8 so comment text is written the same way on every OS.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
        for page in PAGES:
            print("正在爬取" + str(page))
            try:
                comments = fetch_comments(page)
            except (requests.RequestException, ValueError):
                # Best effort: report the failed page and move on, writing
                # nothing for it.
                print("本页爬取失败")
            else:
                for comment in comments:
                    print(comment)
                    out.write(comment + '\n')
                # Persist each page as soon as it is done.
                out.flush()
            # Throttle requests to the server.
            time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == "__main__":
    main()