from scrapy.spider import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
class Spider(CrawlSpider):
    """Crawl Baidu Baike entries by sequential numeric id and record each title.

    Writes one entry title per line to ``e:/word.txt``. Redirect handling:
    the Requests below carry ``dont_redirect`` so 301/302 responses are
    delivered to ``parse`` instead of being silently followed.
    """
    name = 'wordSpider'
    NUM = 14220485  # highest entry id to request
    start_urls = [
        "http://baike.baidu.com/view/1.htm"
    ]
    # NOTE(review): opened at class-definition time and never closed; a Scrapy
    # item pipeline (or spider_closed signal) is the proper place for this.
    fi = open('e:/word.txt', 'w')
    cnt = 2  # next entry id to schedule

    def parse(self, response):
        selector = Selector(response)
        word = selector.xpath(
            'body/div[@class="body-wrapper"]/div[@class="content-wrapper"]'
            '/div[@class="content"]/div[@class="main-content"]/dl/dd/h1/text()'
        ).extract_first()
        # Bug fix: extract_first() returns None when the XPath matches nothing
        # (e.g. a redirected or missing entry page); the original crashed on
        # None + str.  Also fixed: the original wrote a literal 'n' instead of
        # a newline, putting every word on one line.
        if word:
            self.fi.write(word + '\t' + '\n')
        if self.cnt <= self.NUM:
            wurl = "http://baike.baidu.com/view/%s.htm" % self.cnt
            self.cnt += 1
            # dont_redirect disables RedirectMiddleware for this request;
            # handle_httpstatus_list lets the 301/302 response reach parse()
            # instead of being dropped by HttpErrorMiddleware.
            yield Request(
                url=wurl,
                meta={'dont_redirect': True,
                      'handle_httpstatus_list': [301, 302]},
                callback=self.parse,
            )
# Question (translated): "This is my crawler source code. How can I prevent
# 301/302 redirects? I want to crawl all Baidu Baike entries, but redirects
# keep happening and I cannot get the pages I want."
# Answer: pass meta={'dont_redirect': True, 'handle_httpstatus_list': [301, 302]}
# on each Request (or set REDIRECT_ENABLED = False in settings) so the 30x
# response is handed to the spider instead of being followed.