问题:
爬取网站:http://www.52jingsai.com/bisai/keji/index.php?jsstatus=2&jssort=0
目的:按照标签爬取每个竞赛的信息
spider代码:
class CsSpider(scrapy.Spider):
    """Walk the 52jingsai.com competition listing tag by tag.

    The crawl proceeds in three hops: ``parse`` follows each tag link found
    in the second filter row of the start page, ``order_parse`` follows each
    link found in the third filter row of the resulting page, and ``details``
    receives the final response together with the item accumulated so far.
    """

    name = 'cs'
    allowed_domains = ['52jingsai.com']
    start_urls = ['http://www.52jingsai.com/bisai/keji/index.php?jsstatus=2&jssort=0']

    def parse(self, response):
        """Follow every tag link in the second filter row of the start page.

        Yields one request per tag; the tag's visible text is stored on a new
        ``CompetitionsItem`` as ``competition_level`` and forwarded in
        ``meta['item']``.
        """
        # The first two <li> entries are deliberately skipped (original [2:]).
        for entry in response.xpath('//div[@class="js"]/div[2]/ul/li')[2:]:
            href = entry.xpath('.//a/@href').get()
            if not href:
                # An <li> without a link cannot be followed; passing None to
                # Request would raise instead of being silently ignored.
                continue
            # .get() returns None when the anchor has no text node, so guard
            # before stripping.
            label = (entry.xpath('.//a/text()').get() or '').strip()
            # A fresh item is built per iteration, so no copy is needed here.
            item = CompetitionsItem(competition_level=label)
            yield scrapy.Request(
                # urljoin turns relative hrefs into absolute URLs; Request
                # rejects URLs without a scheme.
                url=response.urljoin(href),
                callback=self.order_parse,
                meta={'item': item},
            )

    def order_parse(self, response):
        """Follow every link in the third filter row of a tag page.

        Each yielded request carries its own copy of the incoming item with
        ``competitions_label`` set to the link's text.
        """
        item = response.meta.get('item')
        for link in response.xpath('//div[@class="js"]/div[3]/ul/li/a'):
            href = link.xpath('./@href').get()
            if not href:
                continue
            # Copy before mutating so requests scheduled in this loop do not
            # share (and overwrite) a single item instance.
            tagged = deepcopy(item)
            tagged['competitions_label'] = link.xpath('./text()').get()
            # NOTE(review): if these links resolve to the same URLs for
            # different tags, Scrapy's duplicate filter will drop the repeats
            # and `details` will run fewer times than expected; pass
            # dont_filter=True here if that is the case -- confirm against
            # the crawl log.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.details,
                meta={'item': tagged},
            )

    def details(self, response):
        """Print the item that was carried through the request chain."""
        print(response.meta['item'])
这是怎么回事?第一次见,求解!