colorful_daisy 2019-11-25 09:15

I'm crawling second-hand housing data with the Scrapy framework, but the log shows no pages and no items were scraped, and I can't work out why.

1. items.py

import scrapy

class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # listing name
    name = scrapy.Field()
    # floor plan / layout
    type = scrapy.Field()
    # built area
    area = scrapy.Field()
    # orientation
    direction = scrapy.Field()
    # fitment (decoration) status
    fitment = scrapy.Field()
    # elevator available or not
    elevator = scrapy.Field()
    # total price
    total_price = scrapy.Field()
    # unit price
    unit_price = scrapy.Field()
    # property ownership status
    property = scrapy.Field()
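
For context on the crash shown at the bottom of this post: a scrapy.Item behaves like a dict, and a field that holds None is exactly what later reaches the CSV pipeline. A minimal sketch with hypothetical values:

# a minimal sketch (hypothetical values) of how the item behaves
from lianjia.items import LianjiaItem

item = LianjiaItem()
item['name'] = u'某小区 南北通透'    # any unicode value
item['total_price'] = None           # a missed XPath propagates None into the item
print(item.get('unit_price'))        # None: declared but never set, .get() falls back
# item['name'] + u',' + item['total_price']   # would raise the TypeError seen below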

2. settings.py

BOT_NAME = 'lianjia'
SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'
USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'lianjia.pipelines.FilterPipeline': 100,
    'lianjia.pipelines.CSVPipeline': 200,
}
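
If the spider logs zero crawled pages, Lianjia's anti-bot throttling is a likely cause in addition to the callback bug noted in section 4 below. A hedged sketch of settings that often help; the values are illustrative, not from the original post:

# illustrative additions to settings.py, not from the original post
DOWNLOAD_DELAY = 1                    # slow down so the site is less likely to block us
COOKIES_ENABLED = False               # don't reuse a cookie jar that may get flagged
DEFAULT_REQUEST_HEADERS = {
    'Accept-Language': 'zh-CN,zh;q=0.9',
}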

3. pipelines.py

# -*- coding: utf-8 -*-
import io
import re
from scrapy.exceptions import DropItem

class FilterPipeline(object):
    def process_item(self, item, spider):
        # keep only the numeric part of the area, e.g. u'75.6平米' -> u'75.6'
        item['area'] = re.findall(r"\d+\.?\d*", item["area"])[0]
        # BUG in the original: under Python 2 the byte-string literal '暂无数据'
        # never compares equal to the unicode text scraped from the page (hence
        # the UnicodeWarning below); compare against a unicode literal instead
        if item["direction"] == u'暂无数据':
            raise DropItem("No orientation data, dropping item: %s" % item)
        return item

class CSVPipeline(object):
    index = 0
    file = None

    def open_spider(self, spider):
        # io.open lets us write unicode directly under both Python 2 and 3
        self.file = io.open("home.csv", "a", encoding="utf-8")

    def process_item(self, item, spider):
        fields = ["name", "type", "area", "direction", "fitment",
                  "elevator", "total_price", "unit_price", "property"]
        if self.index == 0:
            self.file.write(u",".join(fields) + u"\n")
            self.index = 1
        # BUG in the original: any None field (e.g. a missed XPath) made the
        # "+" concatenation raise TypeError; coerce missing values to u''
        home_str = u",".join(item.get(f) or u"" for f in fields) + u"\n"
        self.file.write(home_str)
        return item

    def close_spider(self, spider):
        self.file.close()
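
The comma-join fix above still mangles rows when a listing name itself contains a comma; the stdlib csv module quotes such fields automatically. A sketch of the same pipeline on top of csv, written for the Python 2 interpreter the traceback below shows:

# -*- coding: utf-8 -*-
# a sketch using the stdlib csv module (same column order as above)
import csv

class CSVPipeline(object):
    COLUMNS = ["name", "type", "area", "direction", "fitment",
               "elevator", "total_price", "unit_price", "property"]

    def open_spider(self, spider):
        self.file = open("home.csv", "ab")       # binary mode for Python 2's csv
        self.writer = csv.writer(self.file)
        self.writer.writerow(self.COLUMNS)

    def process_item(self, item, spider):
        # coerce None to u'' and encode, since Python 2's csv cannot take unicode
        row = [(item.get(c) or u"").encode("utf-8") for c in self.COLUMNS]
        self.writer.writerow(row)
        return item

    def close_spider(self, spider):
        self.file.close()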

4. lianjia_spider.py

import json
import scrapy
from scrapy import Request
from lianjia.items import LianjiaItem

class LianjiaSpiderSpider(scrapy.Spider):
    name = 'lianjia_spider'

    # build the initial request
    def start_requests(self):
        url = 'https://bj.lianjia.com/ershoufang/'
        yield Request(url)

    # parse one listing page
    def parse(self, response):
        # locate the info <div> of every listing on the page
        list_selector = response.xpath("//li/div[@class='info clear']")
        # walk the listings and pull out name, layout, area, orientation, etc.
        for one_selector in list_selector:
            try:
                name = one_selector.xpath("div[@class='title']/a/text()").extract_first()
                other = one_selector.xpath("div[@class='address']/div[@class='houseInfo']/text()").extract_first()
                other_list = other.split("|")
                type = other_list[0].strip()
                area = other_list[1].strip()
                direction = other_list[2].strip()
                fitment = other_list[3].strip()
                # BUG in the original: "//div[...]" searches the whole document,
                # not this listing; prefix "." to stay relative to one_selector
                total_price = one_selector.xpath(".//div[@class='totalPrice']/span/text()").extract_first()
                unit_price = one_selector.xpath(".//div[@class='unitPrice']/@data-price").extract_first()
                url = one_selector.xpath("div[@class='title']/a/@href").extract_first()
                yield Request(url,
                              meta={"name": name, "type": type, "area": area,
                                    "direction": direction, "fitment": fitment,
                                    "total_price": total_price, "unit_price": unit_price},
                              callback=self.otherinformation)
            except Exception:
                # the original bare "except: pass" silently swallowed every
                # parsing error, which hides exactly the failures being debugged
                pass
        # page-data is a JSON string such as {"totalPage":100,"curPage":1};
        # json.loads is sturdier than splitting on ',' and ':'
        page_data = response.xpath("//div[@class='page-box house-lst-page-box']/@page-data").extract_first()
        current_page = json.loads(page_data)["curPage"]
        if current_page < 100:
            next_url = "https://bj.lianjia.com/ershoufang/pg%d/" % (current_page + 1)
            # BUG in the original: the next listing page was sent to
            # otherinformation(), which expects a detail page, so pagination
            # produced nothing; the callback must be self.parse
            yield Request(next_url, callback=self.parse)

    # parse a detail page for the two remaining fields
    def otherinformation(self, response):
        elevator = response.xpath("//div[@class='base']/div[@class='content']/ul/li[12]/text()").extract_first()
        property = response.xpath("//div[@class='transaction']/div[@class='content']/ul/li[5]/span[2]/text()").extract_first()
        item = LianjiaItem()
        item["name"] = response.meta['name']
        item["type"] = response.meta['type']
        item["area"] = response.meta['area']
        item["direction"] = response.meta['direction']
        item["fitment"] = response.meta['fitment']
        item["total_price"] = response.meta['total_price']
        item["unit_price"] = response.meta['unit_price']
        item["property"] = property
        item["elevator"] = elevator
        yield item
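
When "no pages and no items" is the symptom, the fastest check is whether the XPaths above still match the live page; scrapy shell makes this a one-minute test:

$ scrapy shell 'https://bj.lianjia.com/ershoufang/'
>>> response.status                                      # anything other than 200 means blocked or redirected
>>> len(response.xpath("//li/div[@class='info clear']"))
>>> # 0 matches means the page layout changed or a verification page came back;
>>> # inspect response.text to see what was actually served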

The error output:

UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if item["direction"] == '暂无数据':

2019-11-25 10:53:35 [scrapy.core.scraper] ERROR: Error processing {'area': u'75.6',
 'direction': u'西南',
 'elevator': u'有',
 'fitment': u'简装',
 'name': u'此户型共13套 视野采光好 无硬伤 业主诚意出售',
 'property': u'满五年',
 'total_price': None,
 'type': u'2室1厅',
 'unit_price': None}
Traceback (most recent call last):
  File "f:\python_3.6\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "F:\python_3.6\lianjia\lianjia\pipelines.py", line 25, in process_item
    home_str = item['name']+","+item['type']+","+item['area']+","+item['direction']+","+item['fitment']+","+item['elevator']+","+item['total_price']+","+item['unit_price']+","+item['property']+"\n"
TypeError: coercing to Unicode: need string or buffer, NoneType found
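
The output actually shows two separate problems. The UnicodeWarning comes from comparing a Python 2 byte-string literal against scraped unicode text (fixed with the u'暂无数据' literal in section 3), and the TypeError comes from concatenating None: total_price and unit_price arrive as None when their XPaths find nothing, which the relative-path fix in section 4 addresses. A minimal Python 2 reproduction of both:

# a minimal Python 2 reproduction of both errors in the log
>>> u'西南' == '暂无数据'      # unicode vs bytes: UnicodeWarning, always unequal
False
>>> u'name,' + None            # None in the "+" concatenation chain
TypeError: coercing to Unicode: need string or buffer, NoneType found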
