import scrapy
from douban.items import DoubanItem
import scrapy.utils.misc
import scrapy.core.scraper
def warn_on_generator_with_return_value_stub(spider, callable):
    """Do nothing; stand-in for Scrapy's generator-return-value check.

    Both arguments are accepted only so the call signature matches the
    function this stub is assigned over; neither is inspected.

    Args:
        spider: Ignored.
        callable: Ignored. The name shadows the builtin but is kept so
            keyword callers keep working.

    Returns:
        None, always.
    """
    return None
# Swap Scrapy's ``warn_on_generator_with_return_value`` for the no-op stub in
# both modules that hold a reference to it, so neither copy runs the check.
# NOTE(review): presumably a workaround for that check misbehaving on this
# spider's callbacks -- confirm it is still needed with the Scrapy in use.
for _patched_module in (scrapy.utils.misc, scrapy.core.scraper):
    _patched_module.warn_on_generator_with_return_value = (
        warn_on_generator_with_return_value_stub
    )
del _patched_module  # keep the module namespace free of the loop variable
class DbchuancanSpider(scrapy.Spider):
    """Collect the title and score of each movie on a Douban Top 250 page.

    ``parse`` reads the listing page and schedules one request per movie;
    ``page_detail`` reads the score from the movie's own page and emits
    the finished item.
    """

    name = "dbchuancan"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://movie.douban.com/top250?start=0&filter="]

    def page_detail(self, response):
        """Fill in ``score`` on the item carried in ``response.meta``.

        Args:
            response: Response for a movie detail page; ``meta['item']``
                holds the item created in ``parse``.

        Yields:
            The item with ``score`` set. Nothing is yielded when the score
            node cannot be found; a warning is logged instead.
        """
        item = response.meta['item']
        # .get() returns None instead of raising IndexError when the XPath
        # matches nothing (e.g. the server returned a different page).
        score = response.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'
        ).get()
        if score is None:
            self.logger.warning(
                "No score found on %s (HTTP %s); item skipped",
                response.url,
                response.status,
            )
            return
        item['score'] = score
        yield item

    def parse(self, response):
        """Create one item per listed movie and request its detail page.

        Args:
            response: Response for the listing page in ``start_urls``.

        Yields:
            One ``scrapy.Request`` per movie entry that has a link, with the
            partially filled item attached under ``meta['item']``.
        """
        for entry in response.xpath('//div[@class="hd"]'):
            detail_url = entry.xpath('./a/@href').get()
            if detail_url is None:
                # Skip only this entry; raising here would end the generator
                # and silently drop every entry after it.
                self.logger.warning(
                    "Entry without a detail link on %s; skipped", response.url
                )
                continue
            item = DoubanItem()
            item['title'] = entry.xpath('./a/span[1]/text()').get()
            yield scrapy.Request(
                url=detail_url,
                callback=self.page_detail,
                # Without an errback, a detail request that fails never
                # reaches page_detail and leaves no clear trace in the log.
                errback=self._log_failed_request,
                meta={'item': item},
            )

    def _log_failed_request(self, failure):
        """Log a detail request that failed instead of reaching its callback."""
        self.logger.error(
            "Request failed: %s (%r)", failure.request.url, failure.value
        )
爬取豆瓣电影 Top 250 的电影名及评分。经检查,在 parse 函数中 print 时能输出 25 条信息,但到了 page_detail 函数中 print 时,数据量少了很多,只有 3~6 条,而且输出的基本都是那 25 条中靠后的几条。不太清楚是怎么回事,请大家解惑。
以下是日志信息:
(venv) PS C:\python_learning\douban> scrapy crawl dbchuancan
2024-02-12 00:09:30 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: douban)
2024-02-12 00:09:30 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)], pyOpenSSL 24.0.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.2, Platform Windows-10-10.0.22621-SP0
2024-02-12 00:09:30 [scrapy.addons] INFO: Enabled addons:
[]
2024-02-12 00:09:30 [asyncio] DEBUG: Using selector: SelectSelector
2024-02-12 00:09:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-02-12 00:09:30 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.windows_events._WindowsSelectorEventLoop
2024-02-12 00:09:30 [scrapy.extensions.telnet] INFO: Telnet Password: ef09d2f430b48558
2024-02-12 00:09:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.logstats.LogStats']
2024-02-12 00:09:30 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'douban',
'FEED_EXPORT_ENCODING': 'utf-8',
'NEWSPIDER_MODULE': 'douban.spiders',
'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
'SPIDER_MODULES': ['douban.spiders'],
'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
2024-02-12 00:09:30 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloader