import scrapy
class AnimeSpider(scrapy.Spider):
    """Spider for dmzj.com.

    Level 1 (`parse`): the category index page — five top-level category
    <span>s, each followed by a sibling list of sub-category links.
    Level 2 (`parse_anime`): one sub-category listing page (~18 comics),
    yielding one item dict per comic.
    """

    name = 'anime'
    allowed_domains = ['dmzj.com']
    # Level 1: crawl starts at the category index page.
    start_urls = ['https://www.dmzj.com/category']

    def parse(self, response):
        """Extract the 5 top-level categories and schedule one request
        per sub-category, carrying the category names along in meta."""
        span_list = response.xpath('//div[@class="public_com"]/span[2]')
        for span in span_list:
            category = span.xpath('./text()').extract_first()
            # Sub-categories are <li><a> nodes inside the span's
            # following sibling element.
            li_list = span.xpath('./following-sibling::*/li/a')
            for li in li_list:
                # Build a FRESH dict per request. Scrapy requests run
                # asynchronously, so passing one shared mutable dict via
                # meta would make every callback observe the values of
                # the last loop iteration (the original bug: data looked
                # fine here but was wrong/missing downstream).
                item = {
                    'category': category,
                    'small_category': li.xpath('./text()').extract_first(),
                }
                small_link = 'http:' + li.xpath('./@href').extract_first()
                yield scrapy.Request(small_link,
                                     callback=self.parse_anime,
                                     meta={'anime': item})

    def parse_anime(self, response):
        """Parse a sub-category listing page and yield one item per comic."""
        base = response.meta.get('anime')
        list_anime = response.xpath('//div/ul[@class="list_con_li"]/li')
        # NOTE(review): if list_anime comes back empty, the listing is
        # probably rendered client-side by JavaScript — inspect
        # response.text to confirm the expected markup is present.
        for anime in list_anime:
            # Copy per comic so the ~18 yielded items don't all alias
            # (and overwrite) the same dict.
            item = dict(base)
            # Comic title
            item['name'] = anime.xpath('.//h3/a/text()').extract_first()
            # Author
            item['author'] = anime.xpath('.//p[1]/text()').extract_first()
            # Genre/type
            item['style'] = anime.xpath('.//p[2]/text()').extract_first()
            # Serialization status
            item['status'] = anime.xpath('.//p[3]/text()').extract_first()
            # Last-update info
            item['renew'] = anime.xpath('.//p[4]/text()').extract_first()
            # Cover image URL
            item['default_image'] = anime.xpath('.//a[@class="comic_img"]/img/@src').extract_first()
            # Yield (not print) so items actually reach Scrapy's
            # item pipeline / feed exporters.
            yield item