哈哈贞贞 2016-04-05 13:29 采纳率: 50%
浏览 1897

决策树ID3python编程遇到的问题

各位大神,小弟入门数据分析现在,python也就会个基础语法,以下是照着机器学习的代码敲的,数据集是自己编的,但是运行结果报错:unhashable type: 'list',希望大神可以给我讲解一下,谢谢啦

from math import log
import operator
def CreateDataset():
dataset=[[0,1,1,'high'],
[0,1,1,'high'],
[0,1,1,'high'],
[0,0,1,'high'],
[0,1,1,'high'],
[0,0,1,'high'],
[0,1,0,'high'],
[1,1,1,'high'],
[1,1,0,'high'],
[1,1,1,'high'],
[1,1,1,'high'],
[1,1,1,'high'],
[1,1,1,'high'],
[0,1,1,'high'],
[1,0,1,'high'],
[1,0,1,'high'],
[1,0,1,'high'],
[1,0,1,'high'],
[1,0,0,'high'],
[0,0,0,'high'],
[0,0,1,'low'],
[0,0,1,'low'],
[0,0,1,'low'],
[0,0,0,'low'],
[0,1,0,'low'],
[1,0,1,'low'],
[1,0,1,'low'],
[0,0,0,'low'],
[0,0,0,'low'],
[1,0,0,'low'],
[0,1,0,'low'],
[1,0,1,'low'],
[1,0,0,'low'],
[1,0,0,'low']]
labels=['weather','weekend','sales','volumes']
return dataset,labels
def calcShannonEnt(dataset):
numEntries=len(dataset)
labelCounts={}
for featVec in dataset:
currentLabel=featVec[-1]
if currentLabel not in labelCounts.keys():
labelCounts[currentLabel]=0
labelCounts[currentLabel]+=1
shannonEnt=0.0
for key in labelCounts:
prob=float(labelCounts[key])/numEntries
shannonEnt-=prob *log(prob,2)
return shannonEnt

def splitDataSet(dataset,axis,value):
retDataSet=[]
for featVec in dataset:
if featVec[axis]==value:
reduceFeatVec=featVec[:axis]
reduceFeatVec.extend(featVec[axis+1:])
retDataSet.append(reduceFeatVec)
return retDataSet

def chooseBestFeatureToSplit(dataset):
numFeatures=len(dataset[0])-1
baseEntropy=calcShannonEnt(dataset)
bestInfoGain=0.0
bestFeature=-1
for i in range(numFeatures):
featList=[example[i] for example in dataset ]
uniqueVals=set(featList)
newEntropy=0.0
for value in uniqueVals:
subDataSet=splitDataSet(dataset,i,value)
prob=len(subDataSet)/float(len(dataset))
newEntropy +=prob * calcShannonEnt(subDataSet)
infoGain=baseEntropy-newEntropy
if(infoGain>bestInfoGain):
bestInfoGain=infoGain
bestFeature=i
return bestFeature

def majorityCnt(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys():
classCount[vote]=0
classCount[vote]+=1
sortedClassCount=sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=True)
return sortedClassCount[0][0]

def createTree(dataset,labels):
classList=[example[-1] for example in dataset]
if classList.count(classList[0])==len(classList):
return classList
if len(dataset[0])==1:
return majorityCnt(dataset)
bestFeat=chooseBestFeatureToSplit(dataset)
bestFeatLabel=labels[bestFeat]
myTree={bestFeatLabel:{}}
del(labels[bestFeat])
featValues=[example[bestFeat] for example in dataset]
uniqueVals=set(featValues)
for value in uniqueVals:
subLabels=labels[:]
myTree[bestFeatLabel][value]=createTree(splitDataSet(dataset,bestFeat,value),subLabels)
return myTree

myDat,labels = CreateDataset()
print(calcShannonEnt(myDat))

print(splitDataSet(myDat, 1, 1))

print(chooseBestFeatureToSplit(myDat))

print(createTree(myDat, labels))

  • 写回答

1条回答 默认 最新

  • threenewbee 2016-04-05 19:49
    关注
    评论

报告相同问题?

悬赏问题

  • ¥15 oracle集群安装出bug
  • ¥15 关于#python#的问题:自动化测试
  • ¥20 问题请教!vue项目关于Nginx配置nonce安全策略的问题
  • ¥15 教务系统账号被盗号如何追溯设备
  • ¥20 delta降尺度方法,未来数据怎么降尺度
  • ¥15 c# 使用NPOI快速将datatable数据导入excel中指定sheet,要求快速高效
  • ¥15 再不同版本的系统上,TCP传输速度不一致
  • ¥15 高德地图2.0 版本点聚合中Marker的位置无法实时更新,如何解决呢?
  • ¥15 DIFY API Endpoint 问题。
  • ¥20 sub地址DHCP问题