请求各位大佬帮忙看一下,我的问题所在。最近在学习Scrapy的爬虫框架,对照北理嵩天老师的代码,重新选取网站做的,使用scrapy crawl命令运行时,在cmd命令行中无错误,但是在txt文件内没有内容,下面是我的代码,请各位大佬帮忙debug下。
爬取的网站为股城网:
股票列表链接:https://hq.gucheng.com/gpdmylb.html
个股信息链接:https://hq.gucheng.com/'+ stock
1.Spider中爬虫文件(stocks.py)
import scrapy
import re
class StocksSpider(scrapy.Spider):
name = 'stocks'
#allowed_domains = ['hq.gecheng.com']
start_urls = ['https://hq.gucheng.com/gpdmylb.html']
def parse(self, response):
# 对a标签中的链接进行提取
kv = {'user-agent': 'Mozilla/5.0'} # 模拟浏览器发送请求
for href in response.css('a::attr(href)').extract():
try:
stock = re.findall(r"[S][HZ]\d{6}",href)[0]#通过正则表达式获取正确的股票代码
url = 'https://hq.gucheng.com/' + stock
yield scrapy.Request(url,callback=self.parse_stock,headers=kv)
#第二个参数callback给出了处理当前url给出的新的函数即parse_stock
#return item
except:
continue
def parse_stock(self, response):
infoDict = {} # 对每一个页面生成空字典
stockInfo = response.css('.stock_top clearfix')
name = stockInfo.css('.stock_title').extract()[0]
keyList = stockInfo.css('dt').extract()
valueList = stockInfo.css('dd').extract()
for i in range(len(keyList)):
key = re.findall(r'<dt>.*</dt>', keyList[i])[0][1:-5]
# key = key.replace('\u2003','')
# key = key.replace('\xa0', '')
try:
val = re.findall(r'<dd>\d+\.?.*</dd>', valueList[i])[0][0:-5]
except:
val = '--'
infoDict[key] = val
infoDict.update(
{'股票名称': re.findall('\s.*\(', name)[0].split()[0] + re.findall('\>.*\<', name)[0][1:-1]})
yield infoDict
2.pipelines.py
class GuchengstocksInfoPipeline(object):
#openspider指的是当一个爬虫被调用时对应的pipline启动的方法
def open_spider(self, spider):
self.f = open('GuchengStockInfo.txt', 'w')
#close_spider指的是当一个爬虫关闭时对应的pipline启动的方法
def close_spider(self, spider):
self.f.close()
#对每一个item项进行处理时对应的方法,也是最主体的函数
def process_item(self, item, spider):
try:
line = str(dict(item)) + '\n'
self.f.write(line)
except:
pass
return item
3.配置文件settings
ITEM_PIPELINES = {
'GuchengStocks.pipelines.GuchengstocksInfoPipeline': 300,
}