weixin_44740202 2020-09-18 22:43 采纳率: 40%
浏览 302

scrapy中的ImagesPipeline无法对图片进行储存

爬虫文件

class Dm5Spider(scrapy.Spider):
    name = 'dm5'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.dm5.com/manhua-list/']
    url = 'http://www.dm5.com/manhua-list-p{}/'

    def parse(self, response):
        li_list = response.xpath('/html/body/section[2]/div/ul/li')
        for li in li_list:
            comic_title = li.xpath('./div/div[1]/h2/a/text()').extract_first()
            comic_url = 'http://www.dm5.com/' + li.xpath('./div/div[1]/h2/a/@href').extract_first()
            item = Dm5ProItem()
            item['comic_title'] = comic_title
            yield scrapy.Request(comic_url, callback=self.chapter_parse, meta={'item': item})

    def chapter_parse(self, response):
        li_list = response.xpath('//*[@id="detail-list-select-1"]/li')
        for li in li_list:
            chapter_name = li.xpath('./a/text()').extract_first()
            chapter_url = 'http://www.dm5.com/' + li.xpath('./a/@href').extract_first()
            item = response.meta['item']
            item['chapter_name'] = chapter_name.strip()
            yield scrapy.Request(chapter_url, callback=self.content_parse, meta={'item': item})

    def content_parse(self, response):
        page_text = response.text
        cid = re.findall('var DM5_CID=(.+?);', page_text)[0].strip()
        page_count = re.findall('var DM5_PAGEPCOUNT =(.+?);', page_text)[0].strip()
        _mid = re.findall('var DM5_MID=(.+?);', page_text)[0].strip()
        _dt = re.findall('var DM5_VIEWSIGN_DT="(.+?)";', page_text)[0].strip()
        _sign = re.findall('var DM5_VIEWSIGN="(.*?)";', page_text)[0].strip()
        page = 1
        while page < int(page_count):
            url = f'{response.url}chapterfun.ashx?cid={cid}&page=1&key=&language=1&gtk=6&_cid={cid}&_mid={_mid}&_dt={_dt}&_sign={_sign}'
            yield scrapy.Request(url,callback=self.content_parse_2,meta={'item':response.meta['item']})

    def content_parse_2(self, response):
        js_code = response.text
        img_urls = execjs.eval(js_code)
        img_url = img_urls[0]
        item = response.meta['item']
        item['src'] = img_url
        img_name = wurl.parse_url(img_url).path.split('/')[-1]
        item['img_name'] = img_name
        yield item

items

class Dm5ProItem(scrapy.Item):
    # define the fields for your item here like:
    comic_title = scrapy.Field()
    chapter_name = scrapy.Field()
    src = scrapy.Field()
    img_name = scrapy.Field()

settings


import os
BOT_NAME = 'dm5pro'
SPIDER_MODULES = ['dm5pro.spiders']
NEWSPIDER_MODULE = 'dm5pro.spiders'
LOG_LEVEL = 'ERROR'

DEFAULT_REQUEST_HEADERS = {
    'user-agent' : "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    'referer': 'http://www.dm5.com/'

}
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')

ITEM_PIPELINES = {
    'dm5pro.pipelines.Dm5ProPipeline': 300,
}
ROBOTSTXT_OBEY = False

pipelines


class Dm5ProPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        print(f'{item["src"]}正在下载~~')
        yield scrapy.Request(item['src'], meta={'item': item})

    def file_path(self, request, response=None, info=None):
        comic_title = request.meta['item']['comic_title'] #获取动漫名字
        image_store = settings.IMAGES_STORE             # 获取settings中的路径
        comic_title_path = os.path.join(image_store, comic_title) # 对路径跟动漫名字进行拼接
        chapter_name = request.meta['item']['chapter_name'] # 获取章节名称
        chapter_name_path = os.path.join(comic_title_path, chapter_name) # 对路径跟章节名称进行拼接
        if not os.path.exists(chapter_name_path):  # 如果没有这个文件夹
            os.makedirs(chapter_name_path)        # 对这个文件夹进行创建
        # image_name = path.replace("full/", "")
        img_name = request.meta['item']['img_name'] # 获取当前图片名称
        image_path = os.path.join(chapter_name_path, img_name) #对路径跟图片名称进行拼接
        print(image_path) 
        # PycharmProjects/爬虫练手/2.100个简单练手的网站/dm5动漫/dm5pro/images/成为反派的继母/第0话/1_1732.jpg

        return image_path  # 将路径返回

    def item_completed(self, results, item, info):
        return item

链接打印为:PycharmProjects/爬虫练手/2.100个简单练手的网站/dm5动漫/dm5pro/images/成为反派的继母/第0话/1_1732.jpg

但是无法对jpg图片进行储存,到第0话就不在创建图片了

  • 写回答

1条回答 默认 最新

  • zqbnqsdsmd 2020-09-19 00:08
    关注
    评论

报告相同问题?

悬赏问题

  • ¥15 基于STM32心率血氧监测(OLED显示)相关代码运行成功后烧录成功OLED显示屏不显示的原因是什么
  • ¥15 Python动态规划:图2箭头指向的代码如何理解?太抽象了
  • ¥100 X轴为分离变量(因子变量),如何控制X轴每个分类变量的长度。
  • ¥30 求给定范围的全体素数p的(p-2)的连乘积
  • ¥15 VFP如何使用阿里TTS实现文字转语音?
  • ¥100 需要跳转番茄畅听app的adb命令
  • ¥50 寻找一位有逆向游戏盾sdk 应用程序经验的技术
  • ¥15 请问有用MZmine处理 “Waters SYNAPT G2-Si QTOF质谱仪在MSE模式下采集的非靶向数据” 的分析教程吗
  • ¥50 opencv4nodejs 如何安装
  • ¥15 adb push异常 adb: error: 1409-byte write failed: Invalid argument