岑梓铭 2024-12-16 20:53 采纳率: 66.7%
浏览 20

selenium解决js请求来爬虫的案例,宇宙级超级世界难题!无人能解!

我在爬一个叫HaoYo的考研数据网站:http://xn--mnqs00c24c2pw0ii.com/miniPrograms/library/professional ,运用的是python的selenium和request,登录需要先手动微信扫码,扫完码可以回到python控制台摁下回车,然后根据指示往下走,但是它查到的录取分数数据是Canvas图!于是我只能寄希望于获取请求体里的信息。

问了AI说可以用selenium执行js代码,用js的发请求的语法来获取,是成功的,可以在控制台打印出数据,问题是,怎么把js获取到的数据返回到我的python爬虫代码里?

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common import StaleElementReferenceException, TimeoutException
from selenium.webdriver import DesiredCapabilities
import time
import requests
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random
 
# Page that main() hands to openPage() as the crawler's starting URL.
baseUrl = 'http://xn--mnqs00c24c2pw0ii.com/miniPrograms/library/professional'
 
def main():
    """Open the start page, run the interactive query flow, then close the browser.

    The browser is closed in a ``finally`` clause so that an exception raised
    during the flow does not leave a driver/browser process running.
    """
    driver = openPage(baseUrl)
    try:
        # Upper bound, in seconds, for scripts run via execute_async_script().
        driver.set_script_timeout(60)
        goToSortPage(driver)
    finally:
        driver.quit()
 
def openPage(url):
    """Build a fresh driver with initWebDriver(), load *url* in it, and return it."""
    browser = initWebDriver()
    browser.get(url)
    return browser
 
# 初始化创建驱动器
def initWebDriver(pageLoadStrategy=None):
    """Create the Chrome driver used by the crawler.

    Args:
        pageLoadStrategy: Optional WebDriver page-load strategy ("normal",
            "eager" or "none"). When omitted, the browser's default strategy
            is left in place, which is how the driver was effectively
            configured before.

    Returns:
        A ``webdriver.Chrome`` instance with a 3-second implicit wait.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # headless mode
    options.add_argument("--disable-images")  # intended to skip image loading
    # options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

    # Previously "pageLoadStrategy" was written into the shared
    # DesiredCapabilities.CHROME dict, which was never handed to the driver:
    # the setting had no effect and only mutated a class-level dict. The
    # strategy is now applied through the options object, and only on request.
    if pageLoadStrategy is not None:
        options.page_load_strategy = pageLoadStrategy

    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(3)
    return driver
 
# 正式进入页面
def goToSortPage(driver):
    """Log in, walk the interactive filters, then fetch the subject list via the API.

    The request is issued from inside the page with ``fetch`` and the result is
    handed back to Python through the callback that ``execute_async_script``
    passes to the script as its last argument.

    Args:
        driver: The Selenium driver already showing the start page.

    Returns:
        The ``data.list`` value of the API response as Python objects, or
        ``None`` when the script timed out.

    Raises:
        RuntimeError: If the in-page request fails (network error, non-OK
            status, or an unexpected response shape).
    """
    from urllib.parse import urlencode

    # Complete the login step first and obtain the token.
    token = login(driver)
    print(token)
    print(type(token))

    # Wait for the search box to appear, then click it.
    inputBox = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='search']"))
    )
    inputBox.click()

    # NOTE(review): subjectName / subjectCode are captured from the user's
    # selection below but the request further down still uses fixed values;
    # confirm which cascader level the API expects before wiring them in.
    subjectName = ''
    subjectCode = ''

    # First-level category: list the options, then click every match.
    select1 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
    for select in select1:
        print(select.text)
    opt1 = input('\n请输入一级分类:\n')
    for select in select1:
        if opt1 in select.text:
            select.click()

    # Second-level category.
    select2 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
    for select in select2:
        print(select.text)
    opt2 = input('\n请输入二级分类:\n')
    for select in select2:
        if opt2 in select.text:
            subjectName = select.text
            select.click()

    # Third-level category; the code is the text between the parentheses.
    select3 = driver.find_elements(By.CSS_SELECTOR, ".ant-cascader-menu-item-content")
    for select in select3:
        print(select.text)
    opt3 = input('\n请输入三级分类:\n')
    for select in select3:
        if opt3 in select.text:
            s = select.text
            subjectCode = s.split("(")[1].split(")")[0]
            select.click()

    defaultButton1 = driver.find_element(By.CSS_SELECTOR, ".filter .line:nth-child(1) .content .active")
    defaultButton2 = driver.find_element(By.CSS_SELECTOR, ".filter .line:nth-child(2) .content .active")
    areas = driver.find_elements(By.CSS_SELECTOR, ".filter .line:nth-child(1) .content .txts .s")
    schoolTypes = driver.find_elements(By.CSS_SELECTOR, ".filter .line:nth-child(2) .content .txts .s")
    # Map each accepted answer to the button it should click
    # ('' = keep the default, '2' = 985, '3' = 211, '4' = regular universities).
    schoolTypeButtons = {
        '': defaultButton2,
        '2': schoolTypes[0],
        '3': schoolTypes[1],
        '4': schoolTypes[2],
    }

    # Choose the region.
    for area in areas:
        print(area.text, end="  ")
    while True:
        # strip() removes surrounding whitespace; a bare Enter yields ''.
        areaOpt = input("\n请输入地区:1、默认全部就摁【回车】   2、【输入具体地名】\n").strip()
        matched = False
        for area in areas:
            if areaOpt == area.text.strip():
                matched = True
                area.click()
        if matched:
            break
        if areaOpt == '':
            defaultButton1.click()
            break
        print("输入错误,请重新输入上面有的【城市】!")

    # Choose the institution type.
    for schoolType in schoolTypes:
        print(schoolType.text, end="  ")
    while True:
        schoolTypeOpt = input("\n请选择院校类型:1、默认全部就摁【回车】   2、985摁【2】     3、211摁【3】   4、普通高校摁【4】\n").strip()
        if schoolTypeOpt in schoolTypeButtons:
            schoolTypeButtons[schoolTypeOpt].click()
            break
        print("输入错误,请重新输入上面有的【数字】!")

    # Start the search (disabled; the data is fetched through the API instead).
    # searchButton = driver.find_element(By.CSS_SELECTOR, ".searchBtn")
    # searchButton.click()

    # After much trial and error, issuing the request from inside the browser
    # via Selenium-executed JavaScript was the approach that worked.
    params = {
        'limit': 10,
        'page': 1,
        'subjectName': '人工智能',
        'subjectCode': '085410'
    }
    # These three headers were found by experiment (in Apifox) to be required.
    # The browser manages Host itself for fetch(), so that entry may be ignored.
    headers = {
        'Host': 'api.feitent.com',
        'Token': token,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    }

    # urlencode() percent-encodes the values, so non-ASCII text and characters
    # such as '&' cannot corrupt the query string.
    url = "https://api.feitent.com/uapi/api/choose/subject/getInfo?" + urlencode(params)

    # The URL and headers travel as script arguments rather than being pasted
    # into the JavaScript source, so quotes in the token cannot break the script.
    # Selenium appends a callback as the LAST argument; calling it ends the
    # script and its argument becomes execute_async_script()'s return value.
    script = """
        const done = arguments[arguments.length - 1];
        fetch(arguments[0], {method: 'GET', headers: arguments[1]})
            .then(response => {
                if (!response.ok) {
                    throw new Error('Network response was not ok ' + response.statusText);
                }
                return response.json();
            })
            .then(data => {
                console.log(data.data.list);
                done({ok: true, list: data.data.list});
            })
            .catch(error => done({ok: false, message: String(error)}));
    """

    try:
        outcome = driver.execute_async_script(script, url, headers)
    except TimeoutException:
        print("请求超时,请检查网络连接或请求的 URL 是否正确。")
        return None

    # The script wraps its result so that a failure is never mistaken for data.
    if not outcome.get('ok'):
        raise RuntimeError(outcome.get('message'))
    response = outcome.get('list')
    print(response)
    return response
 
    # 滑动到页面底部
    # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # time.sleep(2) # 等待两秒,不然数据还没加载出来
    # hasPages = True
    # schoolTotals = '0'
    # try:
    #     pageButtonBox = WebDriverWait(driver, 1).until(
    #         EC.presence_of_element_located((By.CSS_SELECTOR, ".pagination .ant-pagination"))
    #     )
    # except (StaleElementReferenceException,TimeoutException) as e:
    #     hasPages = False
    #     print("没有翻页,只有当前一页!")
    #
    # schoolInformations = []
    # # 如果存在翻页元素,就说明不止一页数据,需要手动翻页获取所有数据
    # if hasPages:
    #     # 先输出一下总共几条数据(多页就是在下面翻页元素那里有)
    #     str = driver.find_element(By.CSS_SELECTOR, ".pagination .ant-pagination .ant-pagination-total-text").text
    #     schoolTotals = str.split(" ")[1]
    #     print("数据总条数:", schoolTotals)
    #
    #     # 然后开始循环点击每一页,并调用打印函数拿到每一页的所有数据
    #     while True:
    #         # 调用打印,把数据返回存入schoolInformations数组
    #         time.sleep(2) # 等待两秒,不然数据还没加载出来
    #         list = printSchoolsData(driver)
    #         schoolInformations += list
    #
    #         # 别忘了每次下滑到最底部,不然找不到翻页元素
    #         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    #         nextPage = driver.find_element(By.CSS_SELECTOR, ".ant-pagination li[title='下一页']")
    #         # 当“下一页”元素有“ant-pagination-disabled”这个类名的时候,就代表到最后一页了,停止循环翻页
    #         if 'ant-pagination-disabled' in nextPage.get_attribute('class'):
    #             break
    #         nextPage.click()
    # # 如果不存在翻页元素,就说明只有一页数据,直接获取这一页全部数据即可
    # else:
    #     # 先输出一下总共几条数据(没有翻页元素,就直接统计一页的数量)
    #     schools = driver.find_elements(By.CSS_SELECTOR, ".institutionItem")
    #     schoolTotals = len(schools)
    #     print("数据总条数:", schoolTotals)
    #
    #     # 调用打印,把数据返回存入schoolInformations数组
    #     schoolInformations = printSchoolsData(driver)
 
# 调用打印函数
def printSchoolsData(driver):
    """Collect and print title, institution name and type for each listed school.

    Args:
        driver: Selenium driver showing a result page.

    Returns:
        A list of dicts with the keys "title", "insititutionName" and
        "schoolType", one per ``.institutionItem`` element found on the page.

    Raises:
        IndexError: If fewer title/name/type elements than school cards are
            found, i.e. the page does not have the expected structure.
    """
    schools = driver.find_elements(By.CSS_SELECTOR, ".institutionItem")
    titles_yearsChoose = driver.find_elements(By.CSS_SELECTOR,  ".institutionItem  .info .institutionInfo .p1")
    insititutionNames = driver.find_elements(By.CSS_SELECTOR, ".institutionItem  .info .institutionInfo .p2")
    schoolTypes = driver.find_elements(By.CSS_SELECTOR, ".institutionItem  .info .institutionInfo .p3")

    schoolInformations = []
    for idx, _ in enumerate(schools):
        # Read each element's text exactly once: every .text access asks the
        # browser again, and reusing the value guarantees that the printed
        # line and the stored record agree.
        title = titles_yearsChoose[idx].text
        institutionName = insititutionNames[idx].text
        schoolType = schoolTypes[idx].text
        schoolInformations.append({
            "title": title,
            "insititutionName": institutionName,
            "schoolType": schoolType
        })
        print(f"标题:{title},学校名称:{institutionName},学校类型:{schoolType}")
    return schoolInformations
 
def get_response(response):
    """Return *response* unchanged.

    This is a plain Python pass-through; it performs no processing of its own.
    """
    return response
 
 
def login(driver):
    """Open the login dialog, wait for the manual WeChat QR login, return the token.

    Args:
        driver: Selenium driver showing the start page with its modal dialog.

    Returns:
        The value of the browser cookie named "token".

    Raises:
        RuntimeError: If no "token" cookie exists after the user confirms that
            the login is finished.
    """
    button = driver.find_element(By.CSS_SELECTOR, ".ant-modal-body button")
    button.click()

    # An earlier attempt switched to the second window handle and clicked the
    # login button there automatically; it was disabled and the QR-code login
    # is now completed by hand.

    input('\n请先手动进行微信二维码,登录完成后摁【回车】:\n')
    # Short pause before reading cookies (presumably to let the page store them).
    time.sleep(2)

    # get_cookie() returns None when the cookie is absent; report that clearly
    # instead of failing with a TypeError from subscripting None.
    cookie = driver.get_cookie('token')
    if cookie is None:
        raise RuntimeError("未找到名为 token 的 cookie,请确认已完成扫码登录")
    return cookie['value']
 
 
# def get_proxy():
#     proxys = [
#         "http://127.0.0.1:7890",
#         "http://116.211.143.11:80",
#         "http://183.32.88.244:808",
#         "http://121.40.42.35:9999",
#         "http://222.94.148.210:808"
#     ]
#     fakepxs = {}
#     fakepxs['http']= proxys[random.randint(0, len(proxys))]
#     return fakepxs
 
def useSeleniumClickButton():
    """Placeholder with no behavior yet; always returns None."""
    return None
 
# request方法已放弃了,代码舍不得删先放这
def useRequest_GetResponse():
    """Abandoned ``requests``-based variant of the API call; does nothing.

    The former implementation is preserved below as commented-out reference
    code. Calling this function has no effect and returns None.
    """
    # time.sleep(5)  # wait first, otherwise two requests fired at once cause trouble
    # url = "https://api.feitent.com/uapi/api/choose/subject/getInfo"
    # params = {
    #     'limit': 10,
    #     'page': 1,
    #     'subjectName': subjectName,
    #     'subjectCode': subjectCode
    # }
    # headers = {
    #     'Accept': 'application/json, text/plain, */*',
    #     'Accept-Encoding': 'gzip, deflate, br, zstd',
    #     'Accept-Language': 'zh-CN,zh;q=0.9',
    #     'Connection': 'keep-alive',
    #     'Host': 'api.feitent.com',
    #     'Origin': 'http://xn--mnqs00c24c2pw0ii.com',
    #     'Referer': 'http://xn--mnqs00c24c2pw0ii.com/',
    #     'Sec-Fetch-Dest': 'empty',
    #     'Sec-Fetch-Mode': 'cors',
    #     'Sec-Fetch-Site': 'cross-site',
    #     'Sec-Ch-Ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    #     'Sec-Ch-Ua-Mobile': '?0',
    #     'Sec-Ch-Ua-Platform': '"Windows"',
    #     'Token': token,
    #     'Unionid': 'undefined',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    # }
    # # counter the site's anti-scraping measures
    # proxy = {"http": get_proxy()}
    #
    # # send the GET request
    # response = requests.get(url, params=params, headers=headers, proxies=proxy)
    #
    # # check the response status code
    # if response.status_code == 200:
    #     print(response.request.headers)
    #     # handle the response body
    #     data = response.json()
    #     print(response)
    #     print(response.text)
    #     print(data)
    #
    #     # list_data = data['data']['list']
    #     # print(list_data)
    # else:
    #     print('请求失败,状态码:', response.status_code)
    return None
 
# Run the crawler only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

  • 写回答

1条回答 默认 最新

  • 关注

    以下回复参考:皆我百晓生券券喵儿等免费微信小程序作答:

    你的问题是如何将使用Selenium执行的JavaScript代码获取的数据返回到Python中。可以使用driver.execute_async_script()来执行JavaScript代码并获取其结果:Selenium会把一个回调函数作为最后一个参数(arguments[arguments.length - 1])传给脚本,脚本在异步请求完成后调用这个回调并把数据作为参数传入,execute_async_script()就会在Python中返回这份数据。也就是说,需要在Promise的then/catch里调用这个回调,而不是仅仅让脚本返回一个Promise。

    你的代码中已经有一个get_response函数,但它是Python函数,浏览器中的JavaScript无法调用它;数据通过回调返回到Python之后,可以再交给这个函数做后续处理。传给execute_async_script()的第一个参数必须是一段JavaScript源码字符串,请求地址、请求头等数据可以作为后面的参数传入。修改后的代码如下:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    import time
    
    # ... 其他代码保持不变 ...
    
    def get_data_from_js():
        """Fetch the subject list inside the browser and return it to Python.

        Reads the module-level names ``driver`` (the Selenium driver) and
        ``token`` (the login token); both must be assigned before calling.

        Returns:
            The ``data.list`` value of the API response, or ``None`` when the
            script timed out.

        Raises:
            RuntimeError: If the in-page request fails.
        """
        from urllib.parse import urlencode

        # Request parameters.
        params = {
            'limit': 10,
            'page': 1,
            'subjectName': '人工智能',
            'subjectCode': '085410'
        }
        # Request headers.
        headers = {
            'Host': 'api.feitent.com',
            'Token': token,  # must be obtained elsewhere, e.g. from the cookie set after login
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
        }
        url = "https://api.feitent.com/uapi/api/choose/subject/getInfo?" + urlencode(params)

        # execute_async_script() needs JavaScript SOURCE TEXT, not a Python
        # function. Selenium appends a callback as the last argument; whatever
        # the script passes to it becomes the Python return value.
        script = """
            const done = arguments[arguments.length - 1];
            fetch(arguments[0], {method: 'GET', headers: arguments[1]})
                .then(response => {
                    if (!response.ok) {
                        throw new Error('Network response was not ok ' + response.statusText);
                    }
                    return response.json();
                })
                .then(data => done({ok: true, list: data.data.list}))
                .catch(error => done({ok: false, message: String(error)}));
        """

        try:
            outcome = driver.execute_async_script(script, url, headers)
        except TimeoutException:
            print("请求超时,请检查网络连接或请求的URL是否正确。")
            return None  # timed out: signal "no data" to the caller

        # The script wraps its result so a failure is never mistaken for data.
        if not outcome.get('ok'):
            raise RuntimeError(outcome.get('message'))
        return outcome.get('list')
    
    if __name__ == "__main__":
        driver = openPage(baseUrl)
        try:
            # Allow the in-page request up to 60 seconds before timing out.
            driver.set_script_timeout(60)
            # get_data_from_js() reads the module-level `token`, so log in
            # first; without this assignment the call fails with a NameError.
            token = login(driver)
            data = get_data_from_js()  # fetch the data
            print(data)  # show what was fetched
        finally:
            # Close the browser even if login or the request fails.
            driver.quit()
    

    请注意,JavaScript脚本通过回调交回的数据必须是可以序列化为JSON的结构(例如对象、数组、字符串、数字),Selenium会把它转换成对应的Python字典、列表等类型。如果脚本在超时时间内没有调用回调,execute_async_script()会抛出TimeoutException,可以用driver.set_script_timeout()把超时时间设得足够长。此外,请确保你的Selenium driver已经正确安装并配置好了。希望这个解答能帮到你解决问题!

    评论

报告相同问题?

问题事件

  • 创建了问题 12月16日