1. items.py
import scrapy
class LianjiaItem(scrapy.Item):
    """Container for one second-hand housing listing scraped from Lianjia."""
    # Listing title
    name = scrapy.Field()
    # Floor plan, e.g. "2室1厅"
    type = scrapy.Field()
    # Built area
    area = scrapy.Field()
    # Orientation the rooms face
    direction = scrapy.Field()
    # Renovation / fitment state
    fitment = scrapy.Field()
    # Whether an elevator is available
    elevator = scrapy.Field()
    # Total asking price
    total_price = scrapy.Field()
    # Price per square metre
    unit_price = scrapy.Field()
    # Ownership / property-right information
    property = scrapy.Field()
2. settings.py
# Scrapy project name.
BOT_NAME = 'lianjia'
# Packages where Scrapy looks for spiders (existing and newly generated).
SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'
# Spoof a desktop browser user agent so the site serves normal pages.
USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
# Do not honour robots.txt for this crawl.
ROBOTSTXT_OBEY = False
# Pipeline order: FilterPipeline (priority 100) cleans/drops items before
# CSVPipeline (priority 200) writes them to disk.
ITEM_PIPELINES = {
'lianjia.pipelines.FilterPipeline': 100,
'lianjia.pipelines.CSVPipeline': 200,
}
3. pipelines.py
import re
from scrapy.exceptions import DropItem
class FilterPipeline(object):
    """Cleans scraped items.

    Extracts the numeric part of the area string and drops listings that
    are missing area or orientation data.
    """

    def process_item(self, item, spider):
        # Keep only the leading number of the area text, e.g. '75.6平米' -> '75.6'.
        # Guard against a missing area (None) or text with no digits: the
        # original `findall(...)[0]` raised TypeError/IndexError in that case.
        numbers = re.findall(r"\d+\.?\d*", item["area"] or "")
        if not numbers:
            raise DropItem("建筑面积无数据,抛弃此项目:%s" % item)
        item['area'] = numbers[0]
        # Listings whose orientation is the placeholder '暂无数据' carry no data.
        if item["direction"] == '暂无数据':
            raise DropItem("房屋朝向无数据,抛弃此项目:%s" % item)
        return item
class CSVPipeline(object):
    """Appends items to home.csv, writing a header row once per run."""

    # Column order shared by the header row and every data row.
    FIELDS = ("name", "type", "area", "direction", "fitment",
              "elevator", "total_price", "unit_price", "property")

    index = 0   # 0 until the header row has been written
    file = None

    def open_spider(self, spider):
        # utf-8 so the Chinese field values round-trip on any platform.
        self.file = open("home.csv", "a", encoding="utf-8")

    def process_item(self, item, spider):
        if self.index == 0:
            self.file.write(",".join(self.FIELDS) + "\n")
            self.index = 1
        # BUG FIX: detail pages sometimes yield None for a field
        # (e.g. total_price/unit_price in the reported traceback), and
        # concatenating None raised TypeError. Coerce None to "".
        row = ",".join("" if item[f] is None else str(item[f])
                       for f in self.FIELDS)
        self.file.write(row + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()
4. lianjia_spider.py
import scrapy
from scrapy import Request
from lianjia.items import LianjiaItem
class LianjiaSpiderSpider(scrapy.Spider):
    """Spider for Beijing Lianjia second-hand housing listings.

    Crawls listing pages https://bj.lianjia.com/ershoufang/pg<N>/ (N up to
    100), extracts summary fields from each listing card, then follows the
    detail page to collect elevator and ownership information.
    """
    name = 'lianjia_spider'

    def start_requests(self):
        # Seed the crawl with the first listing page; Scrapy routes the
        # response to parse() by default.
        url = 'https://bj.lianjia.com/ershoufang/'
        yield Request(url)

    def parse(self, response):
        """Parse one listing page: one detail request per listing card,
        plus a request for the next listing page."""
        # Each listing card lives in a <div class="info clear"> under an <li>.
        list_selector = response.xpath("//li/div[@class = 'info clear']")
        for one_selector in list_selector:
            try:
                name = one_selector.xpath("div[@class = 'title']/a/text()").extract_first()
                # houseInfo text: "户型 | 面积 | 朝向 | 装修 | ..."
                other = one_selector.xpath("div[@class = 'address']/div[@class = 'houseInfo']/text()").extract_first()
                other_list = other.split("|")
                house_type = other_list[0].strip(" ")
                area = other_list[1].strip(" ")
                direction = other_list[2].strip(" ")
                fitment = other_list[3].strip(" ")
                # BUG FIX: the original used absolute paths ("//div[...]")
                # inside this per-card loop; those search the WHOLE page, so
                # every item got the first card's price (or None) — this is
                # the None total_price/unit_price in the reported error.
                # ".//" restricts the search to the current listing card.
                total_price = one_selector.xpath(".//div[@class = 'totalPrice']/span/text()").extract_first()
                unit_price = one_selector.xpath(".//div[@class = 'unitPrice']/@data-price").extract_first()
                url = one_selector.xpath("div[@class = 'title']/a/@href").extract_first()
                yield Request(url,
                              meta={"name": name, "type": house_type,
                                    "area": area, "direction": direction,
                                    "fitment": fitment,
                                    "total_price": total_price,
                                    "unit_price": unit_price},
                              callback=self.otherinformation)
            except Exception:
                # A malformed card (missing houseInfo, short split list, ...)
                # is skipped rather than aborting the whole page.
                continue
        # The page-data attribute looks like '{"totalPage":100,"curPage":3}';
        # extract the current page number from it.
        page_data = response.xpath("//div[@class = 'page-box house-lst-page-box']/@page-data").extract_first()
        current_page = page_data.split(',')[1].split(':')[1]
        current_page = int(current_page.replace("}", ""))
        if current_page < 100:
            current_page += 1
            next_url = "https://bj.lianjia.com/ershoufang/pg%d/" % (current_page)
            # BUG FIX: the next listing page must be handled by parse(),
            # not otherinformation() (which expects a detail page and the
            # meta dict built above).
            yield Request(next_url, callback=self.parse)

    def otherinformation(self, response):
        """Parse a detail page: add elevator/ownership info and yield the item."""
        # NOTE(review): the positional li[12]/li[5] indexes are fragile —
        # presumably they match the current page layout; verify against the site.
        elevator = response.xpath("//div[@class = 'base']/div[@class = 'content']/ul/li[12]/text()").extract_first()
        ownership = response.xpath("//div[@class = 'transaction']/div[@class = 'content']/ul/li[5]/span[2]/text()").extract_first()
        item = LianjiaItem()
        item["name"] = response.meta['name']
        item["type"] = response.meta['type']
        item["area"] = response.meta['area']
        item["direction"] = response.meta['direction']
        item["fitment"] = response.meta['fitment']
        item["total_price"] = response.meta['total_price']
        item["unit_price"] = response.meta['unit_price']
        item["property"] = ownership
        item["elevator"] = elevator
        yield item
提示错误 (reported errors):

UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
if item["direction"] == '暂无数据':
(the original paste showed '鏆傛棤鏁版嵁', which is the UTF-8 bytes of '暂无数据' mis-decoded as GBK)
2019-11-25 10:53:35 [scrapy.core.scraper] ERROR: Error processing {'area': u'75.6',
'direction': u'\u897f\u5357',
'elevator': u'\u6709',
'fitment': u'\u7b80\u88c5',
'name': u'\u6b64\u6237\u578b\u517113\u5957 \u89c6\u91ce\u91c7\u5149\u597d \u65e0\u786c\u4f24 \u4e1a\u4e3b\u8bda\u610f\u51fa\u552e',
'property': u'\u6ee1\u4e94\u5e74',
'total_price': None,
'type': u'2\u5ba41\u5385',
'unit_price': None}
Traceback (most recent call last):
File "f:\python_3.6\venv\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "F:\python_3.6\lianjia\lianjia\pipelines.py", line 25, in process_item
home_str = item['name']+","+item['type']+","+item['area']+","+item['direction']+","+item['fitment']+","+item['elevator']+","+item['total_price']+","+item['unit_price']+
","+item['property']+"\n"
TypeError: coercing to Unicode: need string or buffer, NoneType found