Day 3 of learning Scrapy. While crawling the wooyun whitehat essence ranking, I can't get the spider to crawl all of the pages.
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WooyunrankautoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    '''
    The fields are, in order:
    registration date
    wooyun nickname
    rank level
    essence vulnerability count
    essence ratio
    '''
    register_date = scrapy.Field()
    nick_name = scrapy.Field()
    rank_level = scrapy.Field()
    essence_count = scrapy.Field()
    essence_ratio = scrapy.Field()
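(For reference, a scrapy.Item supports the mapping protocol, which is what lets csv.DictWriter in the pipeline below write it directly. A quick illustrative check:)

# Illustrative only: an Item can be treated as a dict
item = WooyunrankautoItem(nick_name=u"test", essence_count="10")
print dict(item)  # {'nick_name': u'test', 'essence_count': '10'}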
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import sys
import csv


class WooyunrankautoPipeline(object):
    '''
    Write each item returned from the spider into a CSV file.
    '''
    def __init__(self):
        # Python 2 hack: force the default encoding to utf-8 so the
        # Chinese nicknames don't break the CSV writer.
        reload(sys)
        if sys.getdefaultencoding() != "utf-8":
            sys.setdefaultencoding("utf-8")
        file_obj = open("wooyunrank.csv", "wb")
        fieldnames = ["register_date", "nick_name", "rank_level", "essence_count", "essence_ratio"]
        self.dict_writer = csv.DictWriter(file_obj, fieldnames=fieldnames)
        self.dict_writer.writeheader()

    def process_item(self, item, spider):
        self.dict_writer.writerow(item)
        return item
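The pipeline only runs if it is registered in settings.py (a sketch, assuming the default project layout where the module is named wooyunrankauto). Alternatively, Scrapy's built-in feed export (scrapy crawl wooyunrankauto -o wooyunrank.csv -t csv) would produce a CSV without any custom pipeline.

# settings.py (sketch) -- the number is the pipeline's priority
ITEM_PIPELINES = {
    'wooyunrankauto.pipelines.WooyunrankautoPipeline': 300,
}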
spider.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from scrapy.spider import Spider
from scrapy.selector import Selector
from wooyunrankauto.items import WooyunrankautoItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class WooyunSpider(CrawlSpider):
    '''
    Crawl the wooyun essence vulnerability ranking.
    '''
    name = "wooyunrankauto"
    # 2-second delay between requests
    download_delay = 2
    allowed_domains = ["wooyun.org"]
    start_urls = [
        "http://wooyun.org/whitehats/do/1/page/1"
    ]
    rules = [
        Rule(LinkExtractor(allow=("/whitehats/do/1/page/\d+")), follow=True, callback='parse_item')
    ]

    # def __init__(self):
    #     reload(sys)
    #     if sys.getdefaultencoding() != "utf-8":
    #         sys.setdefaultencoding("utf-8")

    def parse_item(self, response):
        sel = Selector(response)
        infos = sel.xpath("/html/body/div[5]/table/tbody/tr")
        items = []
        for info in infos:
            item = WooyunrankautoItem()
            item["register_date"] = info.xpath("th[1]/text()").extract()[0]
            item["rank_level"] = info.xpath("th[2]/text()").extract()[0]
            item["essence_count"] = info.xpath("th[3]/text()").extract()[0]
            item["essence_ratio"] = info.xpath("th[4]/text()").extract()[0]
            item["nick_name"] = info.xpath("td/a/text()").extract()[0]
            items.append(item)
        return items
The spider.py above only crawls pages 1-5 (the log shows six fetches, with page 1 fetched twice). But when you open page 5 in a browser, links to pages 6, 7, 8 and 9 are right there, so why doesn't the spider ever reach pages 6-9?
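One thing worth ruling out (a guess on my part, not verified): CrawlSpider only follows a response's links after the callback has finished, so if extract()[0] raises IndexError on some row that is missing the expected cells, the whole callback aborts and the pagination links on that page are never followed. A defensive sketch of the same loop that can't die this way:

def parse_item(self, response):
    sel = Selector(response)
    items = []
    for info in sel.xpath("/html/body/div[5]/table/tbody/tr"):
        try:
            item = WooyunrankautoItem()
            item["register_date"] = info.xpath("th[1]/text()").extract()[0]
            item["rank_level"] = info.xpath("th[2]/text()").extract()[0]
            item["essence_count"] = info.xpath("th[3]/text()").extract()[0]
            item["essence_ratio"] = info.xpath("th[4]/text()").extract()[0]
            item["nick_name"] = info.xpath("td/a/text()").extract()[0]
        except IndexError:
            # a row without th/td cells would otherwise kill the callback here
            self.log("skipping row without the expected cells")
            continue
        items.append(item)
    return items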
The second version of spider.py:
def parse_item(self, response):
    sel = Selector(response)
    infos = sel.xpath("/html/body/div[5]/table/tbody/tr")
    items = []
    for info in infos:
        item = WooyunrankautoItem()
        item["register_date"] = info.xpath("th[1]/text()").extract()[0]
        item["rank_level"] = info.xpath("th[2]/text()").extract()[0]
        item["essence_count"] = info.xpath("th[3]/text()").extract()[0]
        item["essence_ratio"] = info.xpath("th[4]/text()").extract()[0]
        item["nick_name"] = info.xpath("td/a/text()").extract()[0]
        items.append(item)
        return item  # note: returns inside the loop, after the first row
This version crawls all the pages, but each page has 20 rows and I only get the first one (the return sits inside the loop, so the function exits on the first iteration; that part I understand). What I don't understand is why this version manages to crawl every page.
Maybe my understanding of Scrapy just isn't deep enough yet; I'm genuinely stuck here. What I want is to crawl all the pages automatically (without re-crawling any of them), and since each page has 20 rows, each page should yield 20 items.
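For reference, this is what I think the loop should look like: a generator that yields one item per row, so every page produces all 20 items while CrawlSpider keeps following the pagination links (a sketch, untested against the live site; add the try/except guard from earlier if the IndexError guess holds):

def parse_item(self, response):
    sel = Selector(response)
    for info in sel.xpath("/html/body/div[5]/table/tbody/tr"):
        item = WooyunrankautoItem()
        item["register_date"] = info.xpath("th[1]/text()").extract()[0]
        item["rank_level"] = info.xpath("th[2]/text()").extract()[0]
        item["essence_count"] = info.xpath("th[3]/text()").extract()[0]
        item["essence_ratio"] = info.xpath("th[4]/text()").extract()[0]
        item["nick_name"] = info.xpath("td/a/text()").extract()[0]
        yield item  # one item per row -> 20 items per page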