python爬虫去哪网热门景点

我用python爬虫去哪网热门景点信息,结果只爬到了两页的内容,不知道是哪的问题,有大佬帮忙看看:

# -*- coding: utf-8 -*-

# created by: tianxing

# created date: 2017-11-1

import scrapy
import re
import datetime
from practice.items import QvnaItem

class QuNaSpider(scrapy.Spider):
    """Spider that crawls the "hot sights" listing on piao.qunar.com.

    ``parse`` walks every listing page, yielding one Request per sight
    detail link (handled by ``parse_page``) and then following the
    pager's 'next' link until it disappears on the last page.
    """
    name = 'qvnawang'
    start_urls = ['http://piao.qunar.com/ticket/list.htm?keyword=%E7%83%AD%E9%97%A8%E6%99%AF%E7%82%B9&region=&from=mpl_search_suggest&subject=']

    def parse(self, response):
        """Parse one listing page.

        Yields a Request for each sight's detail page, then a Request
        for the next listing page (recursing back into ``parse``).
        """
        # Detail-page links inside each sight's hover popup table.
        pages = response.xpath('//div[@class="sight_item_pop"]/table/tr[3]/td/a/@href')
        for each_page in pages:
            single_url = 'http://piao.qunar.com' + each_page.extract()
            # BUG FIX: the original code created ONE QvnaItem before the loop
            # and shared it across every request; concurrent parse_page calls
            # then overwrote each other's fields.  Give each request its own
            # item instead.
            yield scrapy.Request(url=single_url,
                                 meta={'item': QvnaItem()},
                                 callback=self.parse_page)

        # BUG FIX (this is why only ~2 pages were crawled): the original code
        # read the class of the FIRST <a> in the pager and required it to be
        # exactly 'next'.  On most pages the first pager link is a numbered
        # page, so the condition failed and pagination stopped.  Select the
        # 'next' anchor directly; on the last page the list is simply empty,
        # so no exit()/SystemExit gymnastics are needed to end the recursion.
        next_href = response.xpath('//div[@class="pager"]/a[@class="next"]/@href').extract()
        if next_href:
            next_page = 'http://piao.qunar.com' + next_href[0]
            yield scrapy.Request(url=next_page, callback=self.parse)

#爬取单个链接对应的页面内容
def parse_page(self, response):
    """Parse one sight detail page and yield a populated QvnaItem.

    The item instance arrives via ``response.meta['item']`` (set by
    ``parse``).  Every text field is stripped of layout whitespace before
    being stored; a missing field falls back to a benign default ('' for
    text fields, 0 for the rank) instead of aborting the item.
    """

    def first_or_none(selector):
        # First extracted string of *selector*, or None when the node is
        # absent — replaces the repeated try/except IndexError blocks.
        values = selector.extract()
        return values[0] if values else None

    def strip_ws(text):
        # Drop CR/LF/tab, plain, non-breaking and ideographic spaces —
        # the cleanup chain the original repeated for every field.
        for ch in ('\r', '\n', '\t', ' ', '\xa0', '\u3000'):
            text = text.replace(ch, '')
        return text

    item = response.meta['item']
    tour_info = response.xpath('/html/body/div[2]/div[2]/div[@class="mp-description-detail"]')

    # Sight name (this field keeps '/' unchanged, unlike the others).
    raw = first_or_none(tour_info.xpath('div[1]/span[1]/text()'))
    item['name'] = strip_ws(raw) if raw is not None else ''

    # Sight grade/level; default is 0 (not ''), matching original behaviour.
    raw = first_or_none(tour_info.xpath('div[1]/span[2]/text()'))
    item['rank'] = strip_ws(raw) if raw is not None else 0

    # Description.  NOTE: 'decription' (sic) is the field name declared in
    # QvnaItem — kept as-is so the item interface stays compatible.
    raw = first_or_none(tour_info.xpath('div[2]/text()'))
    item['decription'] = strip_ws(raw.replace('/', ',')) if raw is not None else ''

    # Address: also normalise separators and strip both full-width and
    # ASCII brackets before the whitespace cleanup.
    raw = first_or_none(tour_info.xpath('div[3]/span[3]/text()'))
    if raw is not None:
        raw = (raw.replace('/', ',').replace(u'、', '')
                  .replace(u'（', ',').replace('(', ',')
                  .replace(u'）', '').replace(')', ''))
        item['address'] = strip_ws(raw)
    else:
        item['address'] = ''

    # User rating / comment summary.
    raw = first_or_none(tour_info.xpath('div[4]/span[3]/span/text()'))
    item['comment'] = strip_ws(raw.replace('/', ',')) if raw is not None else ''

    # Weather information.
    raw = first_or_none(tour_info.xpath('div[5]/span[3]/text()'))
    item['weather'] = strip_ws(raw.replace('/', ',')) if raw is not None else ''

    # Lowest ticket price.
    raw = first_or_none(tour_info.xpath('div[7]/span/em/text()'))
    item['lowprice'] = strip_ws(raw.replace('/', ',')) if raw is not None else ''

    # Crawl date (today), e.g. '2017-11-01'.
    item['date'] = datetime.datetime.now().strftime('%Y-%m-%d')

    yield item

1个回答

用fiddler抓包看下,要么是第三页的地址或者参数没有对,要么是服务器有反爬虫的机制(比如频繁访问,返回错误页面、验证码)。

Csdn user default icon
上传中...
上传图片
插入图片
抄袭、复制答案,以达到刷声望分或其他目的的行为,在CSDN问答是严格禁止的,一经发现立刻封号。是时候展现真正的技术了!
立即提问
相关内容推荐