qq_43373762 2022-03-30 19:00

Error when scraping Mafengwo travel notes, possibly a cookie reverse-engineering problem

```python
# -*- coding:utf-8 -*-
# @Author  : 
# @file       :  test1.py
# @Time    : 2022/3/30 10:39
# @Function:

import re
import execjs
import requests
import json
from requests.utils import add_dict_to_cookiejar
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as bs
import random
from lxml import etree
import time
import pandas as pd

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

User_Agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",]


HEADERS = {
    'User-Agent': User_Agent[0],
    # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate',

    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

targetUrl = "API"  # placeholder: proxy-extraction API that returns an "ip:port" string

resp = requests.get(targetUrl)

print(resp.status_code)
print(resp.text)
proxy_ip = resp.text
proxies = {
    # The target pages are plain http, so the proxy must also be mapped
    # under the "http" key; with only "https" it is silently unused here
    "http": "http://%(proxy)s/" % {"proxy": proxy_ip},
    "https": "http://%(proxy)s/" % {"proxy": proxy_ip},
}


# Use a single session so cookies persist across requests
session = requests.session()

def get_parameter(url,response):
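    # The first response is expected to be an anti-bot interstitial whose
    # inline JS sets a first-layer __jsl_clearance_s cookie; requesting the
    # page again with that cookie yields the real go() challenge parameters.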
    # Extract the inline JS snippet that sets the first-layer cookie
    print(response.text)
    js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
    # js_clearance = re.findall(r'(.*cookie=(.*?).*)', response.text)[0]
    # Evaluate the JS to obtain the first-layer js_clearance cookie value

    result = execjs.eval(js_clearance).split(';')[0].split('=')[1]
    # Attach the cookie to the session
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': result})
    # Second request: fetch the page carrying the go() challenge parameters
    response = session.get(url, headers=HEADERS, verify=False)
    # Extract the challenge parameters and parse them into a dict
    # parameter = json.loads(re.findall(r';go\((.*?)\)', response.text)[0])
    parameter = json.loads(re.findall(r'};go\((.*?)\)</script>', response.text)[0])
    js_file = ''
    # Pick the hashing script matching the declared algorithm
    if parameter['ha'] == 'sha1':
        js_file = 'sha1.js'
    elif parameter['ha'] == 'sha256':
        js_file = 'sha256.js'
    elif parameter['ha'] == 'md5':
        js_file = 'md5.js'
    return parameter, js_file


def get_cookie(param, file):
    with open(file, 'r') as f:
        js = f.read()
    cmp = execjs.compile(js)
    # Run the go() function from the hashing script with the extracted parameters
    clearance = cmp.call('go', param)
    return clearance


def run(url):
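    # Overall flow: request 1 fetches the JS challenge, request 2 (inside
    # get_parameter) fetches the go() parameters, and request 3 below uses
    # the final computed cookie to fetch the actual page.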
    # First request: expect the JS-challenge page
    response = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
    # Get the challenge parameters and the hash algorithm
    parameter, js_file = get_parameter(url,response)
    # Compute the final cookie value
    clearance = get_cookie(parameter, js_file)
    print(clearance)
    # Overwrite the cookie with the computed value
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': clearance})
    # Third request: should now return the real page
    html = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
    # print(html.cookies)
    # print(html.content.decode())
    return html

# Extract the data fields from a single travel note page
data = []
def get_data(html):

    print('Getting...')
    # Parse the response into an element tree; .xpath() is not available
    # on the plain html.text string and would raise AttributeError
    res_u1 = etree.HTML(html.text)
    # Keys are kept in Chinese because they become the CSV headers:
    # 标题=title, 月份=month, 天数=days, 人物=companions, 费用=cost, 内容=content
    dic = {}
    n = random.randint(5, 10)
    dic['标题'] = res_u1.xpath('//h1[@class="headtext lh80"]/text()')
    dic['月份'] = res_u1.xpath('//ul/li[@class="time"]/text()[2]')
    dic['天数'] = res_u1.xpath('//ul/li[@class="day"]/text()[2]')
    dic['人物'] = res_u1.xpath('//ul/li[@class="people"]/text()[2]')
    dic['费用'] = res_u1.xpath('//ul/li[@class="cost"]/text()[2]')
    dic['内容'] = res_u1.xpath('//p[@class="_j_note_content _j_seqitem"]/text()')
    data.append(dic)
    print(data)
    print(link + '__success')
    time.sleep(n)
    return data

# Collect the links to the individual travel notes on a listing page
def get_links(res):
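    # Note: this collects every <a> href on the listing page (navigation,
    # ads, etc.), so linklist may contain URLs that are not travel notes.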

    page = run(res)
    soup = bs(page.text, "html.parser")
    linklist = []
    for x in soup.find_all('a'):
        link = x.get('href')
        if link:
            linklist.append(link)
    print("links :")
    print(linklist)
    return linklist


for i in range(1,3):
    print('Start_' + str(i))
    url1 = 'http://www.mafengwo.cn/yj/10807/1-0-{}.html'.format(i)
    print(url1)
    print('-' * 20)
    links = get_links(url1)
    for link in links:
        url2 = 'http://www.mafengwo.cn' + link
        print(url2)
        html1 = run(url2)
        data = get_data(html1)

df = pd.DataFrame(data)
print('=' * 20)
# Adjust the output file path here if needed
df.to_csv('MFW_notes_3.csv', mode='a', header=True, encoding='utf_8_sig')
print('All Finished')
```

Error message:

```
Traceback (most recent call last):
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 159, in <module>
    links = get_links(url1)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 139, in get_links
    page = run(res)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 99, in run
    parameter, js_file = get_parameter(url,response)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 63, in get_parameter
    js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
IndexError: list index out of range

Process finished with exit code 1
```

2 answers

CSDN专家-HGJ 2022-03-30 19:34

Most likely `re.findall('cookie=(.*?);location', response.text)` came back empty (strictly, `findall` returns an empty list rather than None when there is no match), so indexing `[0]` goes out of range. Check whether the page source actually contains the expected content and whether the regex is written correctly.
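For reference, a minimal sketch of that check (the function name and the `challenge_debug.html` dump file are illustrative, not part of the original code):

```python
import re

def extract_challenge_js(response):
    # re.findall returns an empty list (not None) when nothing matches,
    # so guard before indexing [0]
    matches = re.findall(r'cookie=(.*?);location', response.text)
    if not matches:
        # Save the body to see what the server actually sent, e.g. a
        # captcha page, a different challenge variant, or the real page
        with open('challenge_debug.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        raise RuntimeError(
            'challenge JS not found (HTTP %s); see challenge_debug.html'
            % response.status_code
        )
    return matches[0]
```

If the dump turns out to be the real page rather than the challenge, the cookie-solving steps can simply be skipped for that request.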
