qq_43373762 2022-03-30 19:00

Error when scraping Mafengwo travel notes, possibly a cookie reverse-engineering problem

```python
# -*- coding:utf-8 -*-
# @Author  : 
# @file       :  test1.py
# @Time    : 2022/3/30 10:39
# @Function:

import re
import execjs
import requests
import json
from requests.utils import add_dict_to_cookiejar
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as bs
import random
from lxml import etree
import time
import pandas as pd

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

User_Agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",]


HEADERS = {
    'User-Agent': User_Agent[0],
    # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate',

    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}

targetUrl = "API"  # placeholder: proxy-extraction API that returns an "ip:port" string

resp = requests.get(targetUrl)

print(resp.status_code)
print(resp.text)
proxy_ip = resp.text
proxies = {
    # The target pages are plain http, so the proxy must also be mapped
    # under the "http" key; with only "https" it is silently unused here
    "http": "http://%(proxy)s/" % {"proxy": proxy_ip},
    "https": "http://%(proxy)s/" % {"proxy": proxy_ip},
}


# Use a single session so cookies persist across requests
session = requests.session()

def get_parameter(url,response):
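    # The first response is expected to be an anti-bot interstitial whose
    # inline JS sets a first-layer __jsl_clearance_s cookie; requesting the
    # page again with that cookie yields the real go() challenge parameters.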
    # Extract the inline JS snippet that sets the first-layer cookie
    print(response.text)
    js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
    # js_clearance = re.findall(r'(.*cookie=(.*?).*)', response.text)[0]
    # Evaluate the JS to obtain the first-layer js_clearance cookie value

    result = execjs.eval(js_clearance).split(';')[0].split('=')[1]
    # Attach the cookie to the session
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': result})
    # Second request: fetch the page carrying the go() challenge parameters
    response = session.get(url, headers=HEADERS, verify=False)
    # Extract the challenge parameters and parse them into a dict
    # parameter = json.loads(re.findall(r';go\((.*?)\)', response.text)[0])
    parameter = json.loads(re.findall(r'};go\((.*?)\)</script>', response.text)[0])
    js_file = ''
    # Pick the hashing script matching the declared algorithm
    if parameter['ha'] == 'sha1':
        js_file = 'sha1.js'
    elif parameter['ha'] == 'sha256':
        js_file = 'sha256.js'
    elif parameter['ha'] == 'md5':
        js_file = 'md5.js'
    return parameter, js_file


def get_cookie(param, file):
    with open(file, 'r') as f:
        js = f.read()
    cmp = execjs.compile(js)
    # Run the go() function from the hashing script with the extracted parameters
    clearance = cmp.call('go', param)
    return clearance


def run(url):
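    # Overall flow: request 1 fetches the JS challenge, request 2 (inside
    # get_parameter) fetches the go() parameters, and request 3 below uses
    # the final computed cookie to fetch the actual page.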
    # First request: expect the JS-challenge page
    response = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
    # Get the challenge parameters and the hash algorithm
    parameter, js_file = get_parameter(url,response)
    # Compute the final cookie value
    clearance = get_cookie(parameter, js_file)
    print(clearance)
    # Overwrite the cookie with the computed value
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': clearance})
    # Third request: should now return the real page
    html = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
    # print(html.cookies)
    # print(html.content.decode())
    return html

# Extract the data fields from a single travel note page
data = []
def get_data(html):

    print('Getting...')
    # Parse the response into an element tree; .xpath() is not available
    # on the plain html.text string and would raise AttributeError
    res_u1 = etree.HTML(html.text)
    # Keys are kept in Chinese because they become the CSV headers:
    # 标题=title, 月份=month, 天数=days, 人物=companions, 费用=cost, 内容=content
    dic = {}
    n = random.randint(5, 10)
    dic['标题'] = res_u1.xpath('//h1[@class="headtext lh80"]/text()')
    dic['月份'] = res_u1.xpath('//ul/li[@class="time"]/text()[2]')
    dic['天数'] = res_u1.xpath('//ul/li[@class="day"]/text()[2]')
    dic['人物'] = res_u1.xpath('//ul/li[@class="people"]/text()[2]')
    dic['费用'] = res_u1.xpath('//ul/li[@class="cost"]/text()[2]')
    dic['内容'] = res_u1.xpath('//p[@class="_j_note_content _j_seqitem"]/text()')
    data.append(dic)
    print(data)
    print(link + '__success')
    time.sleep(n)
    return data

# Collect the links to the individual travel notes on a listing page
def get_links(res):
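    # Note: this collects every <a> href on the listing page (navigation,
    # ads, etc.), so linklist may contain URLs that are not travel notes.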

    page = run(res)
    soup = bs(page.text, "html.parser")
    linklist = []
    for x in soup.find_all('a'):
        link = x.get('href')
        if link:
            linklist.append(link)
    print("links :")
    print(linklist)
    return linklist


for i in range(1,3):
    print('Start_' + str(i))
    url1 = 'http://www.mafengwo.cn/yj/10807/1-0-{}.html'.format(i)
    print(url1)
    print('-' * 20)
    links = get_links(url1)
    for link in links:
        url2 = 'http://www.mafengwo.cn' + link
        print(url2)
        html1 = run(url2)
        data = get_data(html1)

df = pd.DataFrame(data)
print('=' * 20)
# Adjust the output file path here if needed
df.to_csv('MFW_notes_3.csv', mode='a', header=True, encoding='utf_8_sig')
print('All Finished')
```

Error message:

```
Traceback (most recent call last):
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 159, in <module>
    links = get_links(url1)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 139, in get_links
    page = run(res)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 99, in run
    parameter, js_file = get_parameter(url,response)
  File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 63, in get_parameter
    js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
IndexError: list index out of range

Process finished with exit code 1
```

2 answers

CSDN专家-HGJ 2022-03-30 19:34

Most likely `re.findall('cookie=(.*?);location', response.text)` came back empty (strictly, `findall` returns an empty list rather than None when there is no match), so indexing `[0]` goes out of range. Check whether the page source actually contains the expected content and whether the regex is written correctly.
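For reference, a minimal sketch of that check (the function name and the `challenge_debug.html` dump file are illustrative, not part of the original code):

```python
import re

def extract_challenge_js(response):
    # re.findall returns an empty list (not None) when nothing matches,
    # so guard before indexing [0]
    matches = re.findall(r'cookie=(.*?);location', response.text)
    if not matches:
        # Save the body to see what the server actually sent, e.g. a
        # captcha page, a different challenge variant, or the real page
        with open('challenge_debug.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        raise RuntimeError(
            'challenge JS not found (HTTP %s); see challenge_debug.html'
            % response.status_code
        )
    return matches[0]
```

If the dump turns out to be the real page rather than the challenge, the cookie-solving steps can simply be skipped for that request.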
