爬取不到网页图片的下载地址,但商品的 id 和 name 等其他字段都可以正常获取。
不知道是不是匹配图片 URL 的正则表达式写得有问题。
爬取的网站链接:https://www.ssense.com/en-cn/women?q=top
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from ssense.items import SsenseItem
import re
class SsensePicSpider(scrapy.Spider):
    """Scrape product ID, name, price, SKU and image URL from ssense.com.

    Crawl flow:
      1. ``parse``  - builds the search-result page URLs for a fixed keyword.
      2. ``page``   - extracts every product path found on a result page.
      3. ``next``   - extracts the product fields from a product page and
                      yields a ``SsenseItem``.

    All fields are pulled out of the raw page text with regular expressions,
    so each item field holds the *list* of matches returned by ``findall``.
    """

    name = 'ssense_pic'
    allowed_domains = ['ssense.com']
    start_urls = ['http://ssense.com/']

    # Patterns are raw strings (so ``\s`` / ``\d`` are not treated as string
    # escapes) and are compiled once instead of on every callback.
    _PRODUCT_URL_RE = re.compile(r'"url":\s"([/a-z-0-9]*)"')
    _PRODUCT_ID_RE = re.compile(r'"productID":\s(\d{7})')
    _NAME_RE = re.compile(r'"name":\s"([a-zA-Z -]*)"[,]')
    _PRICE_RE = re.compile(r'"price":\s([0-9]*)')
    _SKU_RE = re.compile(r'"sku":\s"([0-9A-Z]*)",')
    # Accept any character up to the closing quote. A narrow whitelist of
    # URL characters silently yields no match as soon as the URL contains
    # anything outside it (e.g. a comma, an upper-case letter beyond A-F,
    # or a query string), which leaves the image field empty.
    _IMAGE_RE = re.compile(r'"image":\s"([^"]*)"')

    def parse(self, response):
        """Yield one request per search-result page for the search keyword.

        The response of the start URL itself is not inspected; it only
        serves as the entry point of the crawl.
        """
        search_word = 'top'  # search keyword, can be changed
        # range(1, 2) requests only page 1; widen the range to crawl more.
        for page_number in range(1, 2):
            url = ('http://www.ssense.com/en-cn/women?q=' + str(search_word)
                   + '&page=' + str(page_number))
            yield Request(url=url, callback=self.page)

    def page(self, response):
        """Yield a request for every product path found on a result page."""
        body = response.body.decode('utf-8', 'ignore')
        for product_path in self._PRODUCT_URL_RE.findall(body):
            website = 'https://www.ssense.com/en-cn' + product_path
            yield Request(url=website, callback=self.next)

    def next(self, response):
        """Extract the product fields from a product page and yield the item.

        Every field is assigned the list of all regex matches found in the
        page text; a field whose pattern does not match is an empty list.
        """
        body = response.body.decode('utf-8', 'ignore')

        item = SsenseItem()
        item['productID'] = self._PRODUCT_ID_RE.findall(body)
        item['name'] = self._NAME_RE.findall(body)
        item['price'] = self._PRICE_RE.findall(body)
        item['sku'] = self._SKU_RE.findall(body)

        image = self._IMAGE_RE.findall(body)
        if not image:
            # Make a missing image visible in the crawl log instead of
            # silently exporting an empty field.
            self.logger.warning('No image URL found on %s', response.url)
        item['image'] = image

        yield item