I'm now crawling by category plus pagination, but it errors at runtime. How do I feed the paginated URLs into start_requests?
import scrapy
from scrapy.loader import ItemLoader

from ..items import AppItem  # assumes the standard Scrapy project layout


class FyxzSpider(scrapy.Spider):
    name = 'fyxz'
    allowed_domains = ['fyxz.com']
    # start_urls = ['http://fyxz.com/']
    # note: used only as a URL template here, not as Scrapy's usual start URL list
    start_urls = 'http://fyxz.com/sort/%s?page1'
    custom_settings = {
        'CONCURRENT_REQUESTS': 2,
        'DOWNLOAD_DELAY': 1,
    }
    categories = {
        'yingyinbofang': '影音播放', 'xitonggongju': '系统工具', 'tongxunshejiao': '通讯社交', 'shoujimeihua': '手机美化',
        'xinwenyuedu': '新闻阅读', 'sheyingtuxiang': '摄影图像', 'kaoshixuexi': '考试学习', 'wangshanggouwu': '网上购物',
        'jinronglicai': '金融理财', 'shenghuoxiuxian': '生活休闲', 'lvyouchuxing': '旅游出行', 'jiankangyundong': '健康运动',
        'bangongshangwu': '办公商务', 'yuerqinzi': '育儿亲子',
        'xiuxianyizhi': '休闲益智', 'juesebanyan': '角色扮演', 'dongzuomaoxian': '动作冒险', 'wangluoyouxi': '网络游戏',
        'feixingsheji': '飞行射击', 'jingyingcelue': '经营策略', 'paokujingsu': '跑酷竞速', 'tiyujingji': '体育竞技',
        'pukeqipai': '扑克棋牌', 'fuzhugongju': '辅助工具',
    }

    def start_requests(self):
        pass  # stuck here: how do I yield the per-category page URLs?

    def parse_category(self, response):  # never reached: nothing schedules a request with this callback
        for key in self.categories:
            yield scrapy.Request(self.start_urls % key, callback=self.parse_page)

    def parse_page(self, response):
        lis = response.xpath('//div[@class="sort-content"]/ul/li/a/@href')
        for li in lis:
            # hrefs may be relative, so join them against the page URL
            yield scrapy.Request(response.urljoin(li.get()), callback=self.parse_item)

    def parse_item(self, response, **kwargs):
        loader = ItemLoader(item=AppItem(), response=response)
        loader.add_xpath('name', '//div[@class="app-down-box down-panel flex"]/div[@class="info"]/h1/text()')
        yield loader.load_item()
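A minimal sketch of one way to do this, assuming the site paginates as /sort/&lt;category&gt;?page=N (the ?page1 in the template above looks like a typo for ?page=1) and using Scrapy's cb_kwargs to carry the category and page number to the next callback. start_requests seeds page 1 for every category, so parse_category is no longer needed, and parse_page re-queues the next page as long as the current one still lists items:

import scrapy


class FyxzSpider(scrapy.Spider):
    name = 'fyxz'
    allowed_domains = ['fyxz.com']
    # Hypothetical template; adjust to the site's real pagination scheme.
    page_url = 'http://fyxz.com/sort/%s?page=%d'
    # Trimmed for the sketch; use the full dict from the question.
    categories = {'yingyinbofang': '影音播放', 'xitonggongju': '系统工具'}

    def start_requests(self):
        # Seed every category at page 1.
        for key in self.categories:
            yield scrapy.Request(
                self.page_url % (key, 1),
                callback=self.parse_page,
                cb_kwargs={'category': key, 'page': 1},
            )

    def parse_page(self, response, category, page):
        hrefs = response.xpath('//div[@class="sort-content"]/ul/li/a/@href').getall()
        for href in hrefs:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_item)
        # Follow pagination while the page still lists apps; if the site
        # exposes a "next" link, following that instead is more robust.
        if hrefs:
            yield scrapy.Request(
                self.page_url % (category, page + 1),
                callback=self.parse_page,
                cb_kwargs={'category': category, 'page': page + 1},
            )

    def parse_item(self, response):
        ...  # unchanged from the question

cb_kwargs (Scrapy ≥ 1.7) delivers the category and page to the callback as plain arguments instead of going through request.meta, and stopping when a page comes back empty avoids hard-coding a page count per category.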