qq_40006118 2022-01-04 16:58

Scrapy crawl stops halfway through and reports "invalid session id"

I built a crawler with Scrapy that crawls three sites at the same time, four to five thousand records in total, and I have run it three times. Every time, one of the sites suddenly stops after the other two have finished and reports this error:

selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id

The place where it stops is different on every run. Posts online say this error appears when the webdriver is called again after it has been closed, but it doesn't feel like a code problem to me; if it were, the crawl shouldn't behave differently on every run.
Hoping someone can explain this.
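
For reference, the failure mode those posts describe is easy to reproduce in isolation (a minimal sketch, assuming chromedriver is installed and on PATH):

from selenium import webdriver
from selenium.common.exceptions import InvalidSessionIdException

driver = webdriver.Chrome()
driver.close()  # closing the last window terminates the session; chromedriver itself keeps running
try:
    driver.get('https://example.com')  # any further command runs against the dead session
except InvalidSessionIdException as e:
    print(e)  # Message: invalid session id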
The middleware file is attached below:

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from selenium import webdriver
import time
import scrapy
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class JobHuntingSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class JobHuntingDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # Note: because of @classmethod, ``cls`` here is the class itself, so the
    # three drivers are class attributes shared by every instance, and
    # __del__ closes those shared drivers whenever any instance is finalized.
    @classmethod
    def __init__(cls):
        cls.BUPT_driver = webdriver.Chrome()
        cls.UESTC_driver = webdriver.Chrome()
        cls.XIDIAN_driver = webdriver.Chrome()

    @classmethod
    def __del__(cls):
        cls.BUPT_driver.close()
        cls.UESTC_driver.close()
        cls.XIDIAN_driver.close()

    @classmethod
    def get_BUPT_driver(cls):
        return cls.BUPT_driver

    @classmethod
    def get_UESTC_driver(cls):
        return cls.UESTC_driver

    @classmethod
    def get_XIDIAN_driver(cls):
        return cls.XIDIAN_driver

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        if spider.name == "UESTC":
            self.UESTC_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.UESTC_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)
        elif spider.name == "XIDIAN":
            self.XIDIAN_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.XIDIAN_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)
        elif spider.name == "BUPT":
            self.BUPT_driver.get(request.url)
            time.sleep(2)
            return scrapy.http.HtmlResponse(url=request.url,
                                            body=self.BUPT_driver.page_source.encode('utf-8'),
                                            encoding='utf-8', request=request, status=200)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


And here is the code of the spider file where the problem occurs:

import scrapy
from job_hunting.items import JobHuntingItem
from job_hunting.middlewares import JobHuntingDownloaderMiddleware
import time
from datetime import datetime

class mySpider(scrapy.spiders.Spider):
    name = "XIDIAN"
    # 'allowed_domains' is the attribute Scrapy actually reads; 'allow_domains' is silently ignored
    allowed_domains = ['job.xidian.edu.cn']
    start_urls = ["https://job.xidian.edu.cn/campus/index?domain=xidian&city=&page=1"]

    xidian_next_page = ''

    def parse(self, response):
        item = JobHuntingItem()
        next_page_href = response.css('li[class="next"]>a::attr(href)').extract()
        last_page_href = response.css('li[class="last"]>a::attr(href)').extract()
        if next_page_href != last_page_href:
            self.xidian_next_page = 'https://job.xidian.edu.cn' + next_page_href[0]
        else:
            self.xidian_next_page = ''
        c_page_url_list = response.css('ul[class="infoList"]>li:nth-child(1)>a')
        for job in c_page_url_list:
            driver = JobHuntingDownloaderMiddleware.get_XIDIAN_driver()
            driver.get('https://job.xidian.edu.cn' + job.css('a::attr(href)').extract()[0])
            time.sleep(2)
            item['job_title'] = [driver.find_element('css selector', 'div[class="info-left"]>div>h5').text]
            date_text = driver.find_element('css selector', 'div[class="share"]>ul>li:nth-child(1)').text
            date_text = date_text[date_text.find(':') + 1:]
            if datetime.strptime(date_text, '%Y-%m-%d %H:%M') < datetime.strptime('2021-09-01 00:00', '%Y-%m-%d %H:%M'):
                self.xidian_next_page = ''
                break
            item['job_date'] = [date_text]
            views_text = driver.find_element('css selector', 'div[class="share"]>ul>li:nth-child(2)').text
            item['job_views'] = [views_text[views_text.find(':') + 1:]]
            item['job_number'] = ['None']
            yield item
        if self.xidian_next_page != '':
            yield scrapy.Request(self.xidian_next_page, callback=self.parse)


1 answer

  • 晴泪 2022-01-06 18:53

    This blogger ran into a situation somewhat similar to yours; you can use it for reference: https://blog.csdn.net/weixin_35757704/article/details/120706276
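
    In case it helps: in the posted middleware, __init__ and __del__ are decorated with @classmethod, so the three drivers are class attributes shared across instances, and __del__ closes them as soon as any middleware instance is finalized, even while another spider is still using its driver. That would match an "invalid session id" that only appears after the other crawls finish. Below is a minimal sketch of the alternative, one driver per crawl tied to the spider's lifecycle via Scrapy signals (the name SeleniumDownloaderMiddleware is made up for illustration, and this reading of the cause is an assumption, not something the linked post confirms):

    from scrapy import signals
    from scrapy.http import HtmlResponse
    from selenium import webdriver
    import time


    class SeleniumDownloaderMiddleware:
        # Hypothetical sketch: the driver is owned by this instance and lives
        # exactly as long as the spider, so no other crawl's teardown can
        # invalidate the session.

        @classmethod
        def from_crawler(cls, crawler):
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            return s

        def spider_opened(self, spider):
            self.driver = webdriver.Chrome()

        def spider_closed(self, spider):
            self.driver.quit()  # quit(), not close(): ends the session and stops chromedriver cleanly

        def process_request(self, request, spider):
            self.driver.get(request.url)
            time.sleep(2)
            return HtmlResponse(url=request.url,
                                body=self.driver.page_source.encode('utf-8'),
                                encoding='utf-8', request=request, status=200)

    With this layout the spider would also need to fetch its detail pages through scrapy.Request rather than grabbing the middleware's driver directly with get_XIDIAN_driver().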

    Accepted by the asker as the best answer.

