问题遇到的现象和发生背景
当我使用scrapy CrawlSpider模板爬取网页时,报了如下错误。
2022-05-22 18:12:45 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://xxx/xxx/xxxx> (failed 1 times): [<twisted.python.failure.Failure OpenSSL.S
SL.Error: [('SSL routines', '', 'unexpected eof while reading')]>]
问题相关代码,请勿粘贴截图
我的spider代码
class ReadSpider(CrawlSpider):
name = 'read'
allowed_domains = ['xxxxx']
start_urls = ['xxxxx']
rules = (
Rule(LinkExtractor(allow=r'/book/[\d]+_[\d]+.html'), callback='parse_item', follow=True),
Rule(LinkExtractor(restrict_xpaths='//div[@id="tab1"]/div[@class="class-nav"]/a'), callback='parse_item',
follow=True),
)
# follow=True根据执行完解析类后的response继续提取
def parse_item(self, response):
# item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
# item['name'] = response.xpath('//div[@id="name"]').get()
# item['description'] = response.xpath('//div[@id="description"]').get()
books = response.xpath('//div[@class="bookslist"]/ul/li/div')
for book in books:
item = DushuItem()
name = book.xpath('./h3/a/text()').get()
p = book.xpath('./p')
author = p[0].xpath('./text()').get()
info = p[1].xpath('./text()').get()
imgUrl = book.xpath('./div/a/img/@src').get()
infoUrl = 'htttp://xxxxxx.com' + book.xpath('./h3/a/@href').get()
item['name'] = '《' + name + '》'
item['author'] = author
item['imgUrl'] = imgUrl
item['info'] = info
# return scrapy.Request(url=infoUrl, callback=self.parseDetail, meta={'item': item})
# yield item
print('------------',infoUrl[0:len(infoUrl)-1])
# print(infoUrl[1:])
yield scrapy.Request(url=infoUrl[0:len(infoUrl)-1], callback=self.parseDetail, meta={'item': item})
def parseDetail(self, response):
item = response.meta['item']
infoS = response.xpath('//div[@class="text txtsummary"]/text()').get()
# info = re.findall(r'\u3000\u3000(.*)', infoS)
# info = re.sub(r'"', r'\"', infoS)
# if info:
# item['info'] = info[0]
# else:
# item['info'] = infoS
item['info'] = infoS
yield item
我的解答思路和尝试过的方法
把yield scrapy.Request(url=infoUrl[0:len(infoUrl)-1], callback=self.parseDetail, meta={'item': item})
换成yield item
可以正常爬取,这是为什么呢?