geckodriver.exe 与 python 在同一目录下,仍然报错,使用的是火狐浏览器
代码:
import requests
from bs4 import BeautifulSoup # bs4
import re
import pandas as pd
import urllib.request
from selenium import webdriver # 模拟鼠标点击
# Launch a Firefox instance controlled by Selenium.
# The geckodriver location is passed through the `executable_path` keyword:
# the first positional parameter of webdriver.Firefox() is the Firefox
# profile, not the driver binary, and calling webdriver.Firefox() with no
# arguments only works when geckodriver can be found on PATH.
# A raw string keeps the Windows backslashes literal.
driver = webdriver.Firefox(executable_path=r"D:\python\geckodriver.exe")
# URL of the first results page.
# NOTE(review): the original snippet used `url` without ever assigning it
# (NameError). The value below reuses the paginated search URL from the
# scraping loop with page=1 -- confirm this is the intended start page.
url = "https://news.sogou.com/news?oq=%C1%BD%B0%B6%B9%D8%CF%B5&mode=1&manual=&stj0=8&query=%C1%BD%B0%B6%B9%D8%CF%B5%B7%A2%D5%B9%C7%B0%BE%B0&stj=8%3B0%3B0%3B0&stj2=0&stj1=0&hp=0&time=0&hp1=&sut=18021&lkt=0%2C0%2C0&ri=8&sst0=1541664088389&sort=1&page=1&w=01025001&dr=1"

driver.get(url)  # load the page in the controlled browser
r = requests.get(url)  # fetch the same page over plain HTTP
soup = BeautifulSoup(r.text, "html.parser")  # parse the HTML with bs4

# Serialise every <div class="p"> and pull the href of each link inside it
# with a regex.
pager_html = str(soup.find_all(name="div", attrs={"class": "p"}))
page_links = re.findall(r'<a href="(.*?)" ', pager_html)

# The regex matches the links to every page except the current one, hence +1.
page = len(page_links) + 1
print(page)  # a bare `page` expression shows nothing when run as a script
# Accumulators for the scraped columns; the loop below appends to each of them.
content = []
title = []
url = []
media = []
time = []

# NOTE(review): range(page) starts at page=0 -- confirm whether the site's
# page numbering is 0- or 1-based.
for i in range(page):
    # Search-results URL for page i.
    baseurl = "https://news.sogou.com/news?oq=%C1%BD%B0%B6%B9%D8%CF%B5&mode=1&manual=&stj0=8&query=%C1%BD%B0%B6%B9%D8%CF%B5%B7%A2%D5%B9%C7%B0%BE%B0&stj=8%3B0%3B0%3B0&stj2=0&stj1=0&hp=0&time=0&hp1=&sut=18021&lkt=0%2C0%2C0&ri=8&sst0=1541664088389&sort=1&page=" + str(i) + "&w=01025001&dr=1"

    # The page is fetched twice: over plain HTTP for BeautifulSoup and in the
    # Selenium-controlled browser for the XPath lookups.
    r = requests.get(baseurl)
    soup = BeautifulSoup(r.text, "html.parser")
    driver.get(baseurl)

    # Titles: text of every link inside <h3 class="vrTitle"> (Selenium).
    title1 = driver.find_elements_by_xpath("//h3[@class='vrTitle']/a")
    title.extend(a.text for a in title1)

    # Article URLs: serialise the <h3 class="vrTitle"> tags and regex out
    # the hrefs (the regex needs text, hence str()).
    data = str(soup.find_all(name="h3", attrs={"class": "vrTitle"}))
    url.extend(re.findall(r'<a href="(.*?)" ', data))

    # Media name and publication time share one <p class="news-from">,
    # separated by a non-breaking space.
    for b in soup.find_all(name="p", attrs={"class": "news-from"}):
        parts = b.get_text().strip().split("\xa0")
        media.append(parts[0])
        # A missing separator used to raise IndexError after `media` had
        # already been appended, leaving the two lists out of step; record
        # an empty time instead.
        time.append(parts[1] if len(parts) > 1 else "")

    # Article summaries: text of every <p class="news-txt">/<span> (Selenium).
    content1 = driver.find_elements_by_xpath("//p[@class='news-txt']/span")
    content.extend(result.text for result in content1)
# Assemble the scraped columns into one table and save it as an Excel file.
columns = {
    "content": content,
    "title": title,
    "url": url,
    "media": media,
    "time": time,
}

# The columns are collected independently, so their lengths can differ.
# Assigning unequal lists column by column raises and loses every scraped row;
# wrapping each list in a Series pads the shorter columns with NaN instead.
lengths = {name: len(values) for name, values in columns.items()}
if len(set(lengths.values())) > 1:
    print("warning: column lengths differ, rows may be misaligned:", lengths)

df = pd.DataFrame(
    {name: pd.Series(values, dtype=object) for name, values in columns.items()},
    columns=list(columns),
)
df.to_excel("爬虫.xlsx", index=False)  # write the spreadsheet

# Close the browser and stop the driver process; the original left it running.
driver.quit()
问题:
D:\python\python.exe D:/Doc/tem.py
Traceback (most recent call last):
File "D:\python\lib\site-packages\selenium\webdriver\common\service.py", line 76, in start
stdin=PIPE)
File "D:\python\lib\subprocess.py", line 800, in __init__
restore_signals, start_new_session)
File "D:\python\lib\subprocess.py", line 1207, in _execute_child
startupinfo)
FileNotFoundError: [WinError 2] 系统找不到指定的文件。
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Doc/tem.py", line 10, in <module>
driver = webdriver.Chrome()
File "D:\python\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 73, in __init__
self.service.start()
File "D:\python\lib\site-packages\selenium\webdriver\common\service.py", line 83, in start
os.path.basename(self.path), self.start_error_message)
selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH.