爬虫程序:
# Redis key the scrapy-redis scheduler pops start/queued requests from.
redis_key = 'xinfang_detail'
# BUGFIX: the rules tuple must NOT be commented out on a CrawlSpider.
# Requests persisted in Redis carry meta['rule'], an index into self._rules;
# with rules disabled _rules is empty, so every dequeued request crashes in
# scrapy/spiders/crawl.py with "IndexError: list index out of range"
# (exactly the traceback reported below).
# After re-enabling (or reordering) rules, also flush the stale requests
# still sitting in Redis, e.g.:  redis-cli del xinfang_detail:requests
rules = (
    Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
)
def parse_item(self, response):
    """Parse one new-home (楼盘) detail page and yield a XinfangItem.

    Invoked by the CrawlSpider rule for URLs matching
    ``fang.anjuke.com/loupan/<id>.html``.
    """
    print("*"*20 + "开始爬取" + response.url)
    item = XinfangItem()
    # Field -> XPath on the detail page. extract_first() yields None when
    # the node is missing instead of raising, so absent fields stay None.
    field_xpaths = {
        # estate name
        'title': '//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()',
        # listed price
        'price': '//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()',
    }
    for field, xpath in field_xpaths.items():
        item[field] = response.xpath(xpath).extract_first()
    # NOTE(review): the original code had an "address" placeholder comment
    # with no extraction — confirm whether an address field should be added.
    yield item
反馈:
2021-05-14 10:38:46 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ly.fang.anjuke.com/loupan/448966.html> (referer: https://ly.fang.anjuke.com/loupan/437877.html) Traceback (most recent call last): File "d:\desktop\anjuke1.0\venv\lib\site-packages\twisted\internet\defer.py", line 662, in _runCallbacks current.result = callback(current.result, *args, **kw) File "d:\desktop\anjuke1.0\venv\lib\site-packages\scrapy\spiders\crawl.py", line 105, in _callback rule = self._rules[response.meta['rule']] IndexError: list index out of range