请问大家:我在爬取一个叫 Steinmart.com 的网站,大家能帮我加一下注释吗?(越详细越好)以下是我的代码:
import copy
import json
import logging
import re
import time
from urllib.parse import urljoin

import openpyxl
import scrapy
from scrapy import cmdline
from scrapy.http.response.html import HtmlResponse

from steinmart_com.items import SteinmartComItem
from steinmart_com.settings import ONLINE
# Route INFO-and-above records to a log file; filemode 'w' truncates the
# file each time the module is imported, so only the latest run is kept.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    filename=r'G:\spider_demo\spider\demo\steinmart_com\Steinmart.log',
    filemode='w',
)
class Steinmart(scrapy.Spider):
    """Spider that walks steinmart.com category pages down to product details.

    Request flow:

    1. ``start_requests`` reads category URLs (plus up to four category
       labels) from a spreadsheet and requests each URL.
    2. ``parse`` reads the product count on the category page and schedules
       one request per result page.
    3. ``parse_list`` follows every product tile found on a result page.
    4. ``parse_detail`` builds a ``SteinmartComItem`` from the product page.
    """

    name = 'Steinmart'
    allowed_domains = ['steinmart.com']
    start_urls = ['https://steinmart.com/']
    # Base URL that product hrefs are resolved against.
    index_url = 'https://steinmart.com/'
    # Number of products per listing page used to derive the page count.
    items_per_page = 20

    def start_requests(self):
        """Yield one request per spreadsheet row that has a URL.

        The workbook path depends on the ``ONLINE`` setting. Row 1 is skipped
        (presumably a header row — TODO confirm). Column 1 holds the URL and
        columns 2-5 hold the category labels, which are forwarded through
        ``meta``; empty cells become ``''``.
        """
        if ONLINE:
            workbook = openpyxl.load_workbook('/home/yang/steinmart_com/steinmart_com/steinmart_com.xlsx')
        else:
            workbook = openpyxl.load_workbook(r'G:\spider_demo\spider_demo\steinmart_com\steinmart_com\steinmart_com.xlsx')
        sheet = workbook.active

        for row in range(2, sheet.max_row + 1):
            url = sheet.cell(row, 1).value
            if not url:
                continue
            category, sub_category, third_category, subdivision_cat = (
                sheet.cell(row, column).value or '' for column in range(2, 6)
            )
            meta = {
                'url': url,
                'category': category,
                'sub_category': sub_category,
                'third_category': third_category,
                'subdivision_cat': subdivision_cat,
            }
            yield scrapy.Request(url, callback=self.parse, dont_filter=True, meta=meta)

    def parse(self, response: HtmlResponse, **kwargs):
        """Schedule one ``parse_list`` request for every result page.

        The element with class ``collection-count`` is assumed to contain the
        total number of products as the first number in its text (the previous
        code stripped a five-character suffix) — TODO confirm against the
        live markup. If no number is found, nothing is scheduled.
        """
        count_text = response.xpath('//*[@class="collection-count"]/text()').get()
        number = re.search(r'\d[\d,]*', count_text or '')
        if number is None:
            self.logger.warning('No product count found on %s', response.url)
            return

        total = int(number.group().replace(',', ''))
        # Ceiling division so a final, partially filled page is not skipped.
        page_count = -(-total // self.items_per_page)
        # Do not start a second query string if the URL already has one.
        separator = '&' if '?' in response.url else '?'
        for page in range(1, page_count + 1):
            page_url = response.url + separator + 'page=' + str(page)
            yield scrapy.Request(page_url, callback=self.parse_list, dont_filter=True, meta=response.meta)

    def parse_list(self, response: HtmlResponse):
        """Yield a ``parse_detail`` request for every product tile on the page.

        Tiles without the expected ``a/div/div`` structure or without a link
        are skipped. The incoming ``meta`` is copied for each product and
        extended with ``brand_name``, ``url``, ``title`` and
        ``product_small_image``.
        """
        for product in response.xpath("//*[@class='col-lg-3 col-md-4 col-sm-6']"):
            if not product.xpath("./a/div/div"):
                continue
            href = product.xpath("./a/@href").get()
            if not href:
                continue

            # NOTE(review): brand_name and title are read from the same node,
            # as in the original code — confirm whether the brand lives in a
            # different element.
            label = product.xpath("./a/div[2]/div[1]/text()").get()

            # Build a fresh dict per product instead of mutating response.meta.
            detail_meta = dict(response.meta)
            detail_meta['brand_name'] = label
            # urljoin avoids a doubled slash for root-relative hrefs and
            # leaves absolute hrefs untouched.
            detail_meta['url'] = urljoin(self.index_url, href)
            detail_meta['title'] = label
            detail_meta['product_small_image'] = product.xpath("./a/div/div/img/@data-src").get()
            yield scrapy.Request(detail_meta['url'], callback=self.parse_detail, dont_filter=True,
                                 meta=detail_meta)

    def parse_detail(self, response: HtmlResponse):
        """Build and yield a ``SteinmartComItem`` for one product page.

        ``url`` and ``title`` come from ``meta``; ``price_now`` is the text of
        the price node with ``$``, tabs and newlines removed. A missing price
        node yields an empty string.
        """
        meta = response.meta
        item = SteinmartComItem()
        item['url'] = meta['url']
        item['title'] = meta['title']

        raw_price = response.xpath(
            '//*[@class="container"]/div[2]/div[2]/div[2]/div/text()'
        ).get(default='')
        item['price_now'] = raw_price.replace('$', '').replace('\t', '').replace('\n', '').strip()

        self.logger.info('%s | %s', item['title'], item['price_now'])
        # Hand the item to Scrapy so item pipelines / feed exports receive it.
        yield item