问题描述:
通过csv文件获得笔记本电脑的id,然后构建其完整的url地址,然后获得电脑的详情页面:
但是在运行过程中,刚开始正常运行,过一会selenium控制的浏览器就卡在一个电脑的详情页面不动了,在selenium控制的浏览器页面手动刷新就又可以正常跳转到接下来的电脑详情页抓取信息,之后过一段时间,就又会卡住
这是什么原因啊?求大佬解答
ps:加了超时就刷新(2次),还是没解决这个问题
下载器中间件代码
from selenium.common.exceptions import TimeoutException
from scrapy.http.response.html import HtmlResponse
from time import sleep
class JD_Spider_MiddleWare(object):
    """Pass-through downloader middleware for the JD spider.

    Kept as a placeholder: it does not modify requests in any way.
    """

    def process_request(self, request, spider):
        """Leave *request* untouched.

        Returning ``None`` tells Scrapy to keep processing the request
        with the remaining downloader middlewares.
        """
        return None
class JD_spider_MiddleWare_return(object):
    """Downloader middleware that re-renders every response with Selenium.

    NOTE(review): fetching ``request.url`` in ``process_response`` means
    Scrapy has already downloaded the page once before Selenium loads it
    again. The idiomatic place for Selenium rendering is
    ``process_request`` (returning an ``HtmlResponse`` there short-circuits
    the normal download) — left here to preserve the existing pipeline shape.
    """

    def process_response(self, request, response, spider):
        """Load ``request.url`` in the spider's browser and return the rendered page.

        On a page-load timeout the browser is refreshed once; if that also
        fails the error is logged and whatever page source the browser
        currently holds is returned.
        """
        try:
            spider.browser.get(request.url)
        except TimeoutException as t1:  # refresh once on timeout; give up after a second failure
            print("1连接超时:+ {}".format(t1))
            print('尝试重新连接......')
            try:
                spider.browser.refresh()
            except TimeoutException as t2:
                print("连接超时2次,将其抛弃,第二次出错信息:+ {}".format(t2))
            except Exception as e:
                print("页面无响应{}".format(e))
            else:
                print('重新连接成功!')
        except Exception as e:
            print("页面无响应+{}".format(e))
        # BUG FIX: the original wrote `if spider == 'jd'`, comparing the
        # spider *object* to a string — always False, so the scroll/sleep
        # branch could never run. Compare the spider's name instead.
        if spider.name == 'jd':
            # Scroll the promo block into view so lazily-loaded content renders.
            target = spider.browser.find_element_by_xpath("//div[@id='J_promGoodsWrap_292']")
            spider.browser.execute_script("arguments[0].scrollIntoView();", target)
            # NOTE(review): a fixed 20s sleep blocks the whole (single-threaded)
            # Selenium pipeline; an explicit WebDriverWait would be safer.
            sleep(20)
        return HtmlResponse(url=request.url, body=spider.browser.page_source,
                            request=request, encoding='utf-8')
spider代码
import scrapy
import pandas
from JD_Computer_Spider.items import goods_detail
from selenium import webdriver
class computer_detail(scrapy.Spider):
    """Spider that visits each JD laptop detail page and extracts item info.

    Product ids come from ``brands_computers.csv`` (column ``goods_id``);
    pages are rendered by the Selenium middleware before ``parse`` runs.
    """
    name = 'detail'

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--log-level=3')  # suppress Chrome console noise
        self.browser = webdriver.Chrome(options=options)
        self.browser.maximize_window()  # maximize window
        # NOTE(review): set a page-load timeout here
        # (self.browser.set_page_load_timeout(...)) so `get()` in the
        # middleware cannot hang indefinitely — likely cause of the stalls.

    def start_requests(self):
        """Yield one detail-page request per goods id in the CSV."""
        data = pandas.read_csv(r'./brands_computers.csv')['goods_id']
        for goods_id in data:
            # BUG FIX: `len(goods_id)` raises TypeError when pandas parses
            # the column as integers; measure the string form instead.
            # (Also renamed from `id`, which shadowed the builtin.)
            if len(str(goods_id)) < 100:
                goods_url = 'https://item.jd.com/' + str(goods_id) + '.html'
                print(goods_url)
                yield scrapy.Request(url=goods_url, callback=self.parse,
                                     meta={'item': goods_id})

    def parse(self, response):
        """Extract name, price, and shop details into a goods_detail item."""
        gd = goods_detail()
        gd['goods_id'] = response.meta['item']
        # BUG FIX: the original XPath embedded the predicate inside the
        # @class string ("//div[@class='itemInfo-wrap/div[1]/text()']"),
        # which matches nothing; close the class test before descending.
        gd['goods_name'] = response.xpath(
            "//div[@class='itemInfo-wrap']/div[1]/text()").extract_first()
        gd['goods_price'] = response.xpath(
            "//span[@class='p-price']/span[2]/text()").extract_first()
        gd['goods_shop_name'] = response.xpath(
            "//div[@class='name']/a/@title").extract_first()
        gd['goods_shop_href'] = response.xpath(
            "//div[@class='name']/a/@href").extract_first()
        if gd['goods_shop_href'] is not None:
            # BUG FIX: JD shop links are protocol-relative ("//mall.jd.com/...");
            # the original 'https' + href produced the malformed "https//...".
            gd['goods_shop_href'] = 'https:' + gd['goods_shop_href']
        yield gd