我用python爬虫去哪网热门景点信息,结果只爬到了两页的内容,不知道是哪的问题,有大佬帮忙看看:
-*- coding: utf-8 -*-
created by:tianxing
created date:2017-11-1
import scrapy
import re
import datetime
from practice.items import QvnaItem
class QuNaSpider(scrapy.Spider):
name = 'qvnawang'
#start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?pd=1&jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&sm=0&sf=0&st=99999&isadv=1&sg=1545043c61dd44d5bf41f9913890abfa&p=1']
start_urls = ['http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9®ion=&from=mpl_search_suggest&subject=']
def parse(self,response):
item = QvnaItem()
#得到初始展示页面的基准xpath(某一页)
#pages = response.xpath('//div[@style="width: 224px;*width: 218px; _width:200px; float: left"]/a/@href')
pages = response.xpath('//div[@class="sight_item_pop"]/table/tr[3]/td/a/@href')
#循环取出每一页上的每一个链接url地址,并调用parse_page函数解析每一个url上的页面内容
for eachPage in pages:
#获取链接URL(页面上所有的链接,每个链接单独处理)
#singleUrl = eachPage.extract()
singleUrl = 'http://piao.qunar.com'+eachPage.extract()
#内部调用parse_page函数
yield scrapy.Request(url = singleUrl,meta={'item':item},callback=self.parse_page)
#取得除最后一页之外的 '下一页' 的xpath
try:
if response.xpath('//div[@class="pager"]/a/@class').extract()[0] == 'next':
nextPage = 'http://piao.qunar.com' + response.xpath('//div[@class="pager"]/a/@href').extract()[0]
# 递归调用,将下一页的URL传进Request函数
yield scrapy.Request(url=nextPage, callback=self.parse)
except IndexError as ie:
# 因最后一页没有上述xpath,所以不满足条件,即可退出递归
try:
exit()
except SystemExit as se:
pass
#爬取单个链接对应的页面内容
def parse_page(self, response):
# 通过meta得到item
item = response.meta['item']
tour_info = response.xpath('/html/body/div[2]/div[2]/div[@class="mp-description-detail"]')
#景点名称
try:
item['name'] = tour_info.xpath('div[1]/span[1]/text()').extract()[0]\
.replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['name'] = ''
#景点等级
try:
item['rank'] = tour_info.xpath('div[1]/span[2]/text()').extract()[0]\
.replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['rank'] = 0
#景点描述
try:
item['decription'] = tour_info.xpath('div[2]/text()').extract()[0]\
.replace('/',',').replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['decription'] = ''
#景点地点
try:
item['address'] = tour_info.xpath('div[3]/span[3]/text()').extract()[0]
item['address'] = item['address'].replace('/',',').replace(u'、','')\
.replace(u'(',',').replace('(',',').replace(u')','').replace(')','')\
.replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['address'] = ''
#用户评价
try:
item['comment'] = tour_info.xpath('div[4]/span[3]/span/text()').extract()[0]\
.replace('/',',').replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['comment'] = ''
#天气情况
try:
item['weather'] = tour_info.xpath('div[5]/span[3]/text()').extract()[0]\
.replace('/',',').replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['weather'] = ''
#门票最低价格
try:
item['lowprice'] = tour_info.xpath('div[7]/span/em/text()').extract()[0]\
.replace('/',',').replace('\r','').replace('\n','').replace('\t','').replace(' ','').replace('\xa0','').replace('\u3000','')
except IndexError as ie:
item['lowprice'] = ''
#发布日期
today = datetime.datetime.now()
item['date'] = today.strftime('%Y-%m-%d')
yield item