问了AI说可以用selenium执行js代码,用js的发请求的语法来获取,是成功的,可以在控制台打印出数据,问题是,怎么把js获取到的数据返回到我的python爬虫代码里?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common import StaleElementReferenceException, TimeoutException
from selenium.webdriver import DesiredCapabilities
import time
import requests
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
baseUrl = 'http://xn--mnqs00c24c2pw0ii.com/miniPrograms/library/professional'
def main():
driver = openPage(baseUrl)
# 设置超时时间(单位为秒)
driver.set_script_timeout(60) # 例如,这里设置为 30 秒
goToSortPage(driver)
driver.quit()
def openPage(url):
# 初始化创建驱动器
driver = initWebDriver()
driver.get(url)
return driver
# 初始化创建驱动器
def initWebDriver():
# 创建了一个 DesiredCapabilities 对象,用于设置 Edge 驱动器的行为
desired_capabilities = DesiredCapabilities.CHROME
# 直接返回,不再等待界面加载完成
desired_capabilities["pageLoadStrategy"] = "none"
options = webdriver.ChromeOptions()
# options.add_argument('--headless') # 无界面模式
options.add_argument("--disable-images") # 禁用图片加载
# options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# 创建一个 Chrome 驱动器实例,并传入 DesiredCapabilities 对象和 ChromeOptions 对象
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(3)
return driver
# 正式进入页面
def goToSortPage(driver):
# 调用登录函数,先完成登录的步骤,并获取token
token = login(driver)
print(token)
print(type(token))
# 等待搜索框出现,然后点击搜索框
inputBox = WebDriverWait(driver, 1).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='search']"))
)
inputBox.click()
subjectName = ''
subjectCode = ''
# 获取分类选择框,并点击
select1 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
for select in select1:
print(select.text)
opt1 = input('\n请输入一级分类:\n')
for select in select1:
if opt1 in select.text:
select.click()
select2 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
for select in select2:
print(select.text)
opt2 = input('\n请输入二级分类:\n')
for select in select2:
if opt2 in select.text:
subjectName = select.text
select.click()
select3 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
for select in select3:
print(select.text)
opt3 = input('\n请输入三级分类:\n')
for select in select3:
if opt3 in select.text:
s = select.text
subjectCode = s.split("(")[1].split(")")[0]
select.click()
defaultButton1 = driver.find_element(By.CSS_SELECTOR, ".filter .line:nth-child(1) .content .active")
defaultButton2 = driver.find_element(By.CSS_SELECTOR, ".filter .line:nth-child(2) .content .active")
areas = driver.find_elements(By.CSS_SELECTOR, ".filter .line:nth-child(1) .content .txts .s")
schoolTypes = driver.find_elements(By.CSS_SELECTOR, ".filter .line:nth-child(2) .content .txts .s")
NineEnghtFive = schoolTypes[0]
TwoOneOne = schoolTypes[1]
Normal = schoolTypes[2]
# 选择地区
for area in areas:
print(area.text, end=" ")
while True:
# 用户输入的字符串,包括换行符\n
# 使用strip()方法来去除字符串两端的空白字符,包括换行符
areaOpt = input("\n请输入地区:1、默认全部就摁【回车】 2、【输入具体地名】\n").strip()
count = 0
for area in areas:
if areaOpt == area.text.strip():
count = 1
area.click()
# 用户按下回车键,返回的字符串也是'',而不是\n
if count == 0 and areaOpt == '':
defaultButton1.click()
break
elif count == 0 and areaOpt != '':
print("输入错误,请重新输入上面有的【城市】!")
elif count == 1:
break
# 选择院校类型
for schoolType in schoolTypes:
print(schoolType.text, end=" ")
while True:
schoolTypeOpt = input("\n请选择院校类型:1、默认全部就摁【回车】 2、985摁【2】 3、211摁【3】 4、普通高校摁【4】\n").strip()
# 用户按下回车键,返回的字符串也是'',而不是\n
if schoolTypeOpt == '':
defaultButton2.click()
break
elif schoolTypeOpt == '2':
NineEnghtFive.click()
break
elif schoolTypeOpt == '3':
TwoOneOne.click()
break
elif schoolTypeOpt == '4':
Normal.click()
break
else:
print("输入错误,请重新输入上面有的【数字】!")
# 开始搜索
# searchButton = driver.find_element(By.CSS_SELECTOR, ".searchBtn")
# searchButton.click()
# 经过我苦苦研究,他妈的终于搞明白只能这样用selenium执行js的办法来发请求获取数据了他妈的
# 定义请求参数
params = {
'limit': 10,
'page': 1,
'subjectName': '人工智能',
'subjectCode': '085410'
}
# 定义请求头,这三个参数是我在apifox试了n遍试出来的必带的数据
headers = {
'Host': 'api.feitent.com',
'Token': token,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# 构建请求 URL
url = "https://api.feitent.com/uapi/api/choose/subject/getInfo?" + "&".join(
[f"{key}={value}" for key, value in params.items()])
# 修正后的 JavaScript 代码,注意 {} 的转义
script = f"""
return new Promise((resolve, reject) => {{
fetch('{url}', {{
method: 'GET',
headers: {{
'Host': '{headers['Host']}',
'Token': '{headers['Token']}',
'User-Agent': '{headers['User-Agent']}'
}}
}}).then(response => {{
if (!response.ok) {{
throw new Error('Network response was not ok ' + response.statusText);
}}
return response.json();
}}).then(data => {{
console.log(data.data.list);
resolve(data.data.list); // 直接返回数据
}}).catch(error => {{
reject(error);
}});
}}
"""
# 使用 execute_async_script 执行异步 JavaScript 脚本
try:
response = driver.execute_async_script(script)
time.sleep(2)
print(response)
except TimeoutException:
print("请求超时,请检查网络连接或请求的 URL 是否正确。")
# 滑动到页面底部
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(2) # 等待两秒,不然数据还没加载出来
# hasPages = True
# schoolTotals = '0'
# try:
# pageButtonBox = WebDriverWait(driver, 1).until(
# EC.presence_of_element_located((By.CSS_SELECTOR, ".pagination .ant-pagination"))
# )
# except (StaleElementReferenceException,TimeoutException) as e:
# hasPages = False
# print("没有翻页,只有当前一页!")
#
# schoolInformations = []
# # 如果存在翻页元素,就说明不止一页数据,需要手动翻页获取所有数据
# if hasPages:
# # 先输出一下总共几条数据(多页就是在下面翻页元素那里有)
# str = driver.find_element(By.CSS_SELECTOR, ".pagination .ant-pagination .ant-pagination-total-text").text
# schoolTotals = str.split(" ")[1]
# print("数据总条数:", schoolTotals)
#
# # 然后开始循环点击每一页,并调用打印函数拿到每一页的所有数据
# while True:
# # 调用打印,把数据返回存入schoolInformations数组
# time.sleep(2) # 等待两秒,不然数据还没加载出来
# list = printSchoolsData(driver)
# schoolInformations += list
#
# # 别忘了每次下滑到最底部,不然找不到翻页元素
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# nextPage = driver.find_element(By.CSS_SELECTOR, ".ant-pagination li[title='下一页']")
# # 当“下一页”元素有“ant-pagination-disabled”这个类名的时候,就代表到最后一页了,停止循环翻页
# if 'ant-pagination-disabled' in nextPage.get_attribute('class'):
# break
# nextPage.click()
# # 如果不存在翻页元素,就说明只有一页数据,直接获取这一页全部数据即可
# else:
# # 先输出一下总共几条数据(没有翻页元素,就直接统计一页的数量)
# schools = driver.find_elements(By.CSS_SELECTOR, ".institutionItem")
# schoolTotals = len(schools)
# print("数据总条数:", schoolTotals)
#
# # 调用打印,把数据返回存入schoolInformations数组
# schoolInformations = printSchoolsData(driver)
# 调用打印函数
def printSchoolsData(driver):
schoolInformations = []
schools = driver.find_elements(By.CSS_SELECTOR, ".institutionItem")
titles_yearsChoose = driver.find_elements(By.CSS_SELECTOR, ".institutionItem .info .institutionInfo .p1")
insititutionNames = driver.find_elements(By.CSS_SELECTOR, ".institutionItem .info .institutionInfo .p2")
schoolTypes = driver.find_elements(By.CSS_SELECTOR, ".institutionItem .info .institutionInfo .p3")
for i in range(len(schools)):
schoolInformations.append({
"title": titles_yearsChoose[i].text,
"insititutionName": insititutionNames[i].text,
"schoolType": schoolTypes[i].text
})
print(f"标题:{titles_yearsChoose[i].text},学校名称:{insititutionNames[i].text},学校类型:{schoolTypes[i].text}")
return schoolInformations
def get_response(response):
# 这个函数将在 JavaScript Promise 解决后被调用
return response
def login(driver):
button = driver.find_element(By.CSS_SELECTOR, ".ant-modal-body button")
button.click()
#
# time.sleep(1)
# currentWindow = driver.current_window_handle
# allWindow = driver.window_handles
# driver.switch_to.window(allWindow[1])
# print("\n【"+driver.title+"】\n")
#
# loginButton = driver.find_element(By.XPATH, "/html/body/div[1]/span[1]/div[1]/div[5]/div/button")
# loginButton.click()
# driver.switch_to.window(currentWindow)
input('\n请先手动进行微信二维码,登录完成后摁【回车】:\n')
time.sleep(2)
token = driver.get_cookie('token')['value']
return token
# def get_proxy():
# proxys = [
# "http://127.0.0.1:7890",
# "http://116.211.143.11:80",
# "http://183.32.88.244:808",
# "http://121.40.42.35:9999",
# "http://222.94.148.210:808"
# ]
# fakepxs = {}
# fakepxs['http']= proxys[random.randint(0, len(proxys))]
# return fakepxs
def useSeleniumClickButton():
pass
# request方法已放弃了,代码舍不得删先放这
def useRequest_GetResponse():
# time.sleep(5) # 等待两秒,不然同时发起两个请求有问题
# url = "https://api.feitent.com/uapi/api/choose/subject/getInfo"
# params = {
# 'limit': 10,
# 'page': 1,
# 'subjectName': subjectName,
# 'subjectCode': subjectCode
# }
# headers = {
# 'Accept': 'application/json, text/plain, */*',
# 'Accept-Encoding': 'gzip, deflate, br, zstd',
# 'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Connection': 'keep-alive',
# 'Host': 'api.feitent.com',
# 'Origin': 'http://xn--mnqs00c24c2pw0ii.com',
# 'Referer': 'http://xn--mnqs00c24c2pw0ii.com/',
# 'Sec-Fetch-Dest': 'empty',
# 'Sec-Fetch-Mode': 'cors',
# 'Sec-Fetch-Site': 'cross-site',
# 'Sec-Ch-Ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
# 'Sec-Ch-Ua-Mobile': '?0',
# 'Sec-Ch-Ua-Platform': '"Windows"',
# 'Token': token,
# 'Unionid': 'undefined',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
# }
# # 反反爬
# proxy = {"http": get_proxy()}
#
# # 发送GET请求
# response = requests.get(url, params=params, headers=headers, proxies=proxy)
#
# # 检查响应状态码
# if response.status_code == 200:
# print(response.request.headers)
# # 处理响应内容
# data = response.json()
# print(response)
# print(response.text)
# print(data)
#
# # list_data = data['data']['list']
# # print(list_data)
# else:
# print('请求失败,状态码:', response.status_code)
pass
if __name__ == "__main__":
main()