想通过selenium模拟爬取网站数据,网站如下:
之前的代码都运行得好好的,可以正常爬取,但今天运行时出现如下提示:
进不了真正想要爬取的网站,导致爬取失败,希望各位大佬可以帮忙解答,以下是我的代码:
from selenium import webdriver
from time import sleep
# 实现无可视化界面
from selenium.webdriver.chrome.options import Options
# 实现规避检测
from selenium.webdriver import ChromeOptions
def get_url(URL):
    """Open the investing.com markets page, click through to the full
    stock list, and print/return a ``{company_name: detail_url}`` dict.

    Parameters
    ----------
    URL : str
        The markets page to start from.

    Returns
    -------
    dict
        Mapping from company name to the href of its detail page.
        (Original printed only; returning the dict is backward-compatible.)
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    )
    driver_path = 'C:/Users/YounGQ/AppData/Local/Programs/Python/Python37/chromedriver'

    # Anti-detection options: hide the "enable-automation" infobar/switch
    # and present a normal browser user-agent.
    option = ChromeOptions()
    option.add_experimental_option("excludeSwitches", ["enable-automation"])
    option.add_experimental_option("useAutomationExtension", False)
    option.add_argument('--user-agent=%s' % user_agent)

    # BUG FIX: the original called webdriver.Chrome(executable_path=driver_path)
    # WITHOUT passing `option`, so none of the anti-detection settings took
    # effect — which is why the site started blocking the scraper.
    bro = webdriver.Chrome(executable_path=driver_path, options=option)
    try:
        bro.get(URL)
        sleep(10)

        # Click the "stocks" tab.
        bro.find_element_by_xpath('//*[@id="stocks"]').click()
        sleep(5)

        # Open the filter dropdown and select "all".
        bro.find_element_by_xpath('//*[@id="stocksFilter"]').click()
        bro.find_element_by_xpath('//*[@id="all"]').click()
        sleep(10)

        # BUG FIX: an XPath ending in /@href is not usable with
        # find_element_by_xpath (Selenium only returns element nodes);
        # fetch the <a> elements and read the attribute instead. Also use
        # find_elements (plural) — the singular form returns only the first
        # match — and note that the original list(elem.text) split one
        # string into individual characters rather than building a list.
        anchors = bro.find_elements_by_xpath(
            '//*[@class="bold left noWrap elp plusIconTd"]/a'
        )
        all_company = [a.text for a in anchors]
        all_url = [a.get_attribute('href') for a in anchors]

        # Renamed `date` -> `data`: it is a company->url mapping, not a date.
        data = dict(zip(all_company, all_url))
        print(data)
        return data
    finally:
        # BUG FIX: the original never closed the browser (process leak).
        bro.quit()
if __name__ == "__main__":
url = 'https://cn.investing.com/markets/united-states'
get_url(url)