Submitting with yield item fails when using Python Scrapy + Selenium
Here is the spider file:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # for the headless browser
from selenium.webdriver import ActionChains
import time
# from scrapy.http import HtmlResponse
from weiqi.items import FileItem

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
# Check whether an element exists on the page
def check_element_exists(driver, element, condition):
    try:
        if condition == 'class':
            driver.find_element_by_class_name(element)
        elif condition == 'id':
            driver.find_element_by_id(element)
        elif condition == 'xpath':
            driver.find_element_by_xpath(element)
        return True
    except Exception:
        return False
class Number1Spider(scrapy.Spider):
    name = 'number2'
    # allowed_domains = ['www.asxs.net']
    start_urls = ['https://www.101weiqi.com/newbook/#/book/42983/']

    def __init__(self):
        self.bro = webdriver.Chrome()
        super().__init__()

    def parse(self, response):
        print(response)
        url = 'https://www.101weiqi.com/newbook/#/book/42983/'
        yield scrapy.Request(url, callback=self.parse_page, meta={'is_bro': True})

    def parse_page(self, response):
        # Click through to the problem-list page
        self.element_click('//*[@id="root"]/div/div[2]/div/div/div/div[1]')
        time.sleep(3)
        # Grab the image URLs
        print('get_img')
        self.get_img_urls(response)

    # Locate an element by XPath and click it
    def element_click(self, xpath):
        soutu_btn = self.bro.find_element_by_xpath(xpath)  # the WebElement to click
        actions = ActionChains(self.bro)  # ActionChains instance bound to the driver
        actions.move_to_element(soutu_btn).click().perform()
    def get_img_urls(self, response):
        print('geting_imgsrc')
        img_url = self.bro.find_elements_by_xpath('//div[@class="sc-gGCDDS tzeuw"]//img')
        path_name = self.bro.find_elements_by_xpath('//*[@id="root"]/div/div[2]/div/div[1]/div[1]/span[3]')[0].text
        file_urls = []
        file_names = []
        for link in img_url:
            file_urls.append(link.get_attribute('src'))
            file_names.append(link.get_attribute('src').split('/')[-1])
        item = FileItem()
        item['file_names'] = file_names
        item['file_urls'] = file_urls
        # As soon as I use yield item here, the whole function stops running.
        # If I only print the item instead, everything is fine.
        yield item

    def closed(self, spider):
        print('Closing the browser!')
        self.bro.quit()
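For completeness, FileItem comes from weiqi/items.py, which I haven't shown; it declares just the two fields assigned above. A minimal sketch of the presumed definition:

import scrapy

class FileItem(scrapy.Item):
    # presumed declaration, inferred from the item['...'] assignments in the spiders
    file_urls = scrapy.Field()
    file_names = scrapy.Field()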
As soon as I submit the item with the final yield, the whole get_img_urls function stops running.
I also tried moving yield item out into a separate function, and whichever function contains the yield item simply never runs. If I replace the yield with print(item), the item prints out normally.
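The same call pattern can be reproduced in a minimal standalone script with no Scrapy or Selenium involved (helper and caller are hypothetical names): the body of a function that contains a yield never runs when the function is simply called, because the call only creates a generator object.

def helper():
    print('inside helper')  # never printed
    yield {'done': True}

def caller():
    print('before helper')  # printed
    helper()                # returns a generator object; the body never executes
    print('after helper')   # printed

caller()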
Below is another spider from the same project, in which yield item works without any problem.
import scrapy
from selenium import webdriver
from weiqi.items import FileItem
from selenium.webdriver.chrome.options import Options  # for the headless browser

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")

class Number1Spider(scrapy.Spider):
    name = 'number1'
    # allowed_domains = ['www.asxs.net']
    start_urls = ['https://www.101weiqi.com/newbook/#/chapter/53608/']

    def __init__(self):
        self.bro = webdriver.Chrome()
        super().__init__()

    # Parse the index page
    def parse(self, response):
        imgs = response.xpath('//div[@class="sc-gGCDDS tzeuw"]//img/@src').extract()
        imgname = []
        for imgurl in imgs:
            imgname.append(imgurl.split('/')[-1])
        print(imgname)
        item = FileItem()
        item['file_urls'] = imgs
        item['file_names'] = imgname
        print(item)
        yield item

    def closed(self, spider):
        print('Closing the browser!')
        self.bro.quit()
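For what it's worth, the only structural difference I can spot between the two spiders is that in number1 the yield sits directly inside parse, the request callback that Scrapy itself iterates, while in number2 the yield lives in get_img_urls, which parse_page calls like an ordinary function.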
I really can't find the cause. Any help would be much appreciated!