I built a crawler with Scrapy that scrapes three websites in parallel, about four to five thousand records in total. I have run it three times, and every single run one of the sites suddenly stops and throws an error after the other ones have finished:
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
The point where it stops is different on every run. Online sources say this error only appears when a WebDriver is called again after it has been closed, but I don't think it is a problem in my code; otherwise the crawl results should not differ from run to run.
I hope someone can explain what is going on.
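For context, this is the situation those sources describe, as far as I understand it (a standalone sketch, not code from my project): once the browser session behind a driver has been closed, any further command sent through that driver fails, and depending on the Selenium/ChromeDriver version that failure typically shows up as InvalidSessionIdException.

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

driver = webdriver.Chrome()
driver.get("https://example.com")   # works: the session is still alive

driver.close()                      # closing the only window ends the session

try:
    # any further command reuses the dead session
    driver.get("https://example.com")
except WebDriverException as e:
    # typically "invalid session id" (the exact exception class depends on
    # the Selenium / ChromeDriver version)
    print(type(e).__name__, e)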
Here is the middleware file as well:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from selenium import webdriver
import time
import scrapy
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class JobHuntingSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JobHuntingDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def __init__(cls):
        cls.BUPT_driver = webdriver.Chrome()
        cls.UESTC_driver = webdriver.Chrome()
        cls.XIDIAN_driver = webdriver.Chrome()

    @classmethod
    def __del__(cls):
        cls.BUPT_driver.close()
        cls.UESTC_driver.close()
        cls.XIDIAN_driver.close()

    @classmethod
    def get_BUPT_driver(cls):
        return cls.BUPT_driver

    @classmethod
    def get_UESTC_driver(cls):
        return cls.UESTC_driver

    @classmethod
    def get_XIDIAN_driver(cls):
        return cls.XIDIAN_driver

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        if spider.name == "UESTC":
            self.UESTC_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.UESTC_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)
        elif spider.name == "XIDIAN":
            self.XIDIAN_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.XIDIAN_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)
        elif spider.name == "BUPT":
            self.BUPT_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.BUPT_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
And here is the spider file where the problem occurs:
import scrapy
from job_hunting.items import JobHuntingItem
from job_hunting.middlewares import JobHuntingDownloaderMiddleware
import time
from datetime import datetime
class mySpider(scrapy.spiders.Spider):
    name = "XIDIAN"
    allowed_domains = ['job.xidian.edu.cn']
    start_urls = ["https://job.xidian.edu.cn/campus/index?domain=xidian&city=&page=1"]
    xidian_next_page = ''

    def parse(self, response):
        item = JobHuntingItem()
        next_page_href = response.css('li[class="next"]>a::attr(href)').extract()
        last_page_href = response.css('li[class="last"]>a::attr(href)').extract()
        # the "next" link differs from the "last" link until we reach the last page
        if next_page_href != last_page_href:
            self.xidian_next_page = 'https://job.xidian.edu.cn' + next_page_href[0]
        else:
            self.xidian_next_page = ''
        c_page_url_list = response.css('ul[class="infoList"]>li:nth-child(1)>a')
        for job in c_page_url_list:
            # fetch each detail page with the shared Selenium driver from the middleware
            driver = JobHuntingDownloaderMiddleware.get_XIDIAN_driver()
            driver.get('https://job.xidian.edu.cn' + job.css('a::attr(href)').extract()[0])
            time.sleep(2)
            item['job_title'] = [driver.find_element('css selector', 'div[class="info-left"]>div>h5').text]
            date_text = driver.find_element('css selector', 'div[class="share"]>ul>li:nth-child(1)').text
            date_text = date_text[date_text.find(':') + 1:]
            # stop paginating once we hit posts published before 2021-09-01
            if datetime.strptime(date_text, '%Y-%m-%d %H:%M') < datetime.strptime('2021-09-01 00:00', '%Y-%m-%d %H:%M'):
                self.xidian_next_page = ''
                break
            item['job_date'] = [date_text]
            views_text = driver.find_element('css selector', 'div[class="share"]>ul>li:nth-child(2)').text
            item['job_views'] = [views_text[views_text.find(':') + 1:]]
            item['job_number'] = ['None']
            yield item
        if self.xidian_next_page != '':
            yield scrapy.Request(self.xidian_next_page, callback=self.parse)