scrapy 中的爬虫部分的代码如下:
import scrapy
from bokeproject.items import BokeprojectItem
from scrapy.http import Request
class HexunspiderSpider(scrapy.Spider):
    """Crawl a Hexun blog's paginated article listing and yield one item per page.

    Listing pages follow the pattern /p<N>/default.html; page 1 is the
    start URL and pages 2-9 are queued from ``parse``.
    """

    name = 'hexunspider'
    # OffsiteMiddleware matches subdomains, so 'hexun.com' also allows
    # 27525283.blog.hexun.com.
    allowed_domains = ['hexun.com']
    start_urls = ['http://27525283.blog.hexun.com/p1/default.html']

    def parse(self, response):
        """Extract article title/url/hits/comment lists from one listing page.

        Yields one BokeprojectItem whose fields are parallel lists (one entry
        per article on the page), then Requests for listing pages 2-9.

        NOTE(review): if this callback never runs even with
        ROBOTSTXT_OBEY = False, check the crawl log for redirects or
        non-200 responses from the site (a browser-like USER_AGENT may be
        required) — the spider code itself is not what blocks the callback.
        """
        item = BokeprojectItem()
        item['name'] = response.xpath('//div[@class="ArticleTitle"]/span/a/text()').extract()
        item['url'] = response.xpath('//div[@class="ArticleTitle"]/span/a/@href').extract()
        item['hits'] = response.xpath('//div[@class="ArticleInfo"]/span/text()').extract()
        item['comment'] = response.xpath('//div[@class="ArticleInfo"]/a/span/text()').extract()
        yield item

        # Queue the remaining listing pages; Scrapy's dupefilter prevents
        # re-crawling pages already seen, so re-yielding from every page is safe.
        for page in range(2, 10):
            next_url = f'http://27525283.blog.hexun.com/p{page}/default.html'
            yield Request(next_url, callback=self.parse)
同样在 settings.py中设置了
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
parse 函数没有执行。各位大佬，这是什么情况？是哪个地方没有设置好吗？