# -*- coding:utf-8 -*-
# @Author :
# @file : test1.py
# @Time : 2022/3/30 10:39
# @Function:
import re
import execjs
import requests
import json
from requests.utils import add_dict_to_cookiejar
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup as bs
import random
from lxml import etree
import time
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
User_Agent = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",]
HEADERS = {
'User-Agent': User_Agent[0],
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
}
targetUrl = "API"#api
resp = requests.get(targetUrl)
print(resp.status_code)
print(resp.text)
proxy_ip= resp.text
proxies = {
"https": "http://%(proxy)s/" % {"proxy": proxy_ip},
}
# 使用session保持会话
session = requests.session()
def get_parameter(url,response):
# 提取js代码
print(response.text)
js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
# js_clearance = re.findall(r'(.*cookie=(.*?).*)', response.text)[0]
# 执行后获得cookie参数js_clearance
result = execjs.eval(js_clearance).split(';')[0].split('=')[1]
# 添加cookie
add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': result})
# 第二次请求
response = session.get(url, headers=HEADERS, verify=False)
# 提取参数并转字典
# parameter = json.loads(re.findall(r';go\((.*?)\)', response.text)[0])
parameter = json.loads(re.findall(r'};go\((.*?)\)</script>', response.text)[0])
js_file = ''
# 判断加密方式
if parameter['ha'] == 'sha1':
js_file = 'sha1.js'
elif parameter['ha'] == 'sha256':
js_file = 'sha256.js'
elif parameter['ha'] == 'md5':
js_file = 'md5.js'
return parameter, js_file
def get_cookie(param, file):
with open(file, 'r') as f:
js = f.read()
cmp = execjs.compile(js)
# 执行js代码传入参数
clearance = cmp.call('go', param)
return clearance
def run(url):
# 第一次请求
response = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
# 获取参数及加密方式
parameter, js_file = get_parameter(url,response)
# 获取cookie
clearance = get_cookie(parameter, js_file)
print(clearance)
# 修改cookie
add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': clearance})
# 第三次请求
html = session.get(url, headers=HEADERS, proxies=proxies,verify=False)
# print(html.cookies)
# print(html.content.decode())
return html
# 从游记中获取数据
data = []
def get_data(html):
print('Getting...')
res_u1 = html.text
# print(res_u)
# res_u1 = etree.HTML(res_u)
# print(res_u1)
dic = {}
n = random.randint(5, 10)
dic['标题'] = res_u1.xpath('//h1[@class="headtext lh80"]/text()')
dic['月份'] = res_u1.xpath('//ul/li[@class="time"]/text()[2]')
dic['天数'] = res_u1.xpath('//ul/li[@class="day"]/text()[2]')
dic['人物'] = res_u1.xpath('//ul/li[@class="people"]/text()[2]')
dic['费用'] = res_u1.xpath('//ul/li[@class="cost"]/text()[2]')
dic['内容'] = res_u1.xpath('//p[@class="_j_note_content _j_seqitem"]/text()')
data.append(dic)
print(data)
print(link + '__success')
time.sleep(n)
return data
# 获取页面中每篇游记的链接
def get_links(res):
# res1 = etree.HTML(res)
# print(res1)
page = run(res)
soup = bs(page.text, "html.parser")
# with open(res, "r") as ecological_pyramid:
# soup = bs(ecological_pyramid)
linklist = []
for x in soup.find_all('a'):
link = x.get('href')
if link:
linklist.append(link)
print("links :")
print(linklist)
return linklist
for i in range(1,3):
print('Start_' + str(i))
url1 = 'http://www.mafengwo.cn/yj/10807/1-0-{}.html'.format(i)
print(url1)
print('-' * 20)
links = get_links(url1)
for link in links:
url2 = 'http://www.mafengwo.cn' + link
print(url2)
html1 = run(url2)
data = get_data(html1)
df = pd.DataFrame(data)
print('=' * 20)
# 修改文件储存路径
df.to_csv('MFW_notes_3.csv', mode='a', header=True, encoding='utf_8_sig')
print('All Finished')
```9
# @Function:
报错信息
Traceback (most recent call last):
File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 159, in <module>
links = get_links(url1)
File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 139, in get_links
page = run(res)
File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 99, in run
parameter, js_file = get_parameter(url,response)
File "C:/programming/pycharm/PyCharm Community Edition 2021.2.2/M_learning/statistics/test1.py", line 63, in get_parameter
js_clearance = re.findall('cookie=(.*?);location', response.text)[0]
IndexError: list index out of range
Process finished with exit code 1
爬取马蜂窝的游记出错可能是cookie的逆向问题
- 写回答
- 好问题 0 提建议
- 追加酬金
- 关注问题
- 邀请回答
-
2条回答 默认 最新
- CSDN专家-HGJ 2022-03-30 19:34关注
应该是re.findall('cookie=(.*?);location', response.text)值为None,即没有匹配,所以索引越界,检查网页代码中有无相应内容,正则写得是否正确。
解决 无用评论 打赏 举报
悬赏问题
- ¥15 公交车和无人机协同运输
- ¥15 stm32代码移植没反应
- ¥15 matlab基于pde算法图像修复,为什么只能对示例图像有效
- ¥100 连续两帧图像高速减法
- ¥15 组策略中的计算机配置策略无法下发
- ¥15 如何绘制动力学系统的相图
- ¥15 对接wps接口实现获取元数据
- ¥20 给自己本科IT专业毕业的妹m找个实习工作
- ¥15 用友U8:向一个无法连接的网络尝试了一个套接字操作,如何解决?
- ¥30 我的代码按理说完成了模型的搭建、训练、验证测试等工作(标签-网络|关键词-变化检测)