机器学习之决策树
数据集为
代码如下
from numpy import *
import matplotlib.pyplot as plt
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
# AcMat =['outlook', 'temperature', 'humidity','windy','play']
# AccMat = [['sunny','overcast','rainy'],['hot','mild','cool'],['high','normal'],['FALSE','TRUE'],['yes','no']]
AcMat =['age', 'prescription', 'astigmatic','tear','lenses']
AccMat = [['young','pre','presbyopic'],['myope','hyper'],['yes','no'],['reduced','normal'],['hard','soft','not']]
def loadDataSet(filename):
dataMat = []
fr = open(filename)
for line in fr.readlines():
lineArr = line.strip().split()
dataMat.append([lineArr[0], lineArr[1],lineArr[2],lineArr[3],lineArr[4]])
return dataMat
##第一个参数:
## 类型:矩阵,大小:N*1,目的:用于保存统计操作的来源
##第二个参数
## 类型:list,大小:1*M,目的:用于保存统计操作的分类来源
##功能:
##用第二个参数对第一个参数进行分类统计
##返回值:
##类型:list,大小:1*(M+1),根据第二参数对第一参数进行分类统计的统计值
def countIFA(dMat,clist):
m = len(clist) #计算分类个数
aaMat = [] #创建统计列表
for i in range(m): #初始化统计列表
aaMat.append(0)
line,n = shape(dMat) #取统计数据的个数
aaMat.append(line)
for i in range(line): #按数据分行统计
for j in range(m): #按类别进行归属
if(dMat[i][-1]) == clist[j]:
aaMat[j] +=1
return aaMat
##入口参数:
##类型:list,大小:1*N,最后一个元素的值是所有前面元素的值之和
##功能:
##计算列表的前N-1项的信息熵
##返回值:
##类型:float,列表的信息熵
def calcShannEnt(cMat) :
from math import log
shann = 0.0 #初始化熵为0.0
arrLen = len(cMat) #求列表的大小
i = 0
while i < arrLen-1:
pvalue = float(cMat[i] / cMat[arrLen - 1])
if pvalue > 0 :
shann -= pvalue * log(pvalue, 2)
i += 1
return shann
##入口参数:
##参数一:
##类型:list,大小:N*M,作用:过滤数据的来源
##参数二:
##类型:list,大小:,作用:过滤数据的二级分类类别
##参数三:
##类型:int,作用:过滤数据的一级分类类别索引
##参数四:
##类型:int,作用:过滤数据的二级分类类别索引
##返回值
##类型:list,值为包含有具体类别的数据列表,可能为空
def filter(dataMat,ccMat,km,kn) :
rm = [] #创建返回值list
m=shape(dataMat)[0] #求数据来源的行数
for i in range(m) :
if dataMat[i][km] == ccMat[km][kn] :
rm.append(dataMat[i])
return rm
"""
入口参数:
参数一:
类型:list,作用:数据来源
参数二:
类型:list,作用:二级分类类别
参数三:
类型:int,作用:一级类别索引
返回值:
返回第三参数指定类别的熵
"""
def calEnt(dataMat,ccMat,kn):
import numpy as np
mdataMat = mat(dataMat) #转换数据的类型属性,为countIFA函数做准备
countMat = countIFA(mdataMat[:, kn], ccMat[kn]) #统计指定的一级类别的各个二级类别的元素个数
countArr = array(countMat)
ArrLen = len(countArr)
if countArr[ArrLen-1] > 0:
probArr = countArr / countArr[ArrLen-1]
else:
probArr = countArr
probArr = probArr[:ArrLen-1] #计算指定的一级类别的各个二级类别元素的比重
##下面的代码用于求指定一级分类的每个二级分类的熵
calssMat = [] #用于保存每个二级类别的熵
cArrLen = len(ccMat[kn]) #求指定的一级类别的二级类别的个数
for i in range(cArrLen): #求定的一级类别的所有二级类别的熵
cdataMat = filter(dataMat,ccMat,kn,i) #按具体的二级类别对数据源过滤
if len(cdataMat) > 0 : #存在满足条件的数据,计算其熵
mcdataMat = mat(cdataMat)
ccountMat = countIFA(mcdataMat[:,-1],ccMat[-1])
calssMat.append(calcShannEnt(ccountMat))
else: #不存在满足条件的数据,直接将其熵设置为0.0
calssMat.append(0.0)
acalssMat = array(calssMat)
return np.dot(probArr,acalssMat) #返回指定一级类别的熵
"""
入口参数:
参数一:
类型:list,作用:数据来源
参数二:
类型:list,作用:一级分类
参数三:
类型:list,作用:二级分类
返回值:
类型:int,返回一个一级类别索引
"""
def BestFeature(dataMat,cMat,ccMat):
labelClass = -1 #默认最佳特征为“不存在”
minEnt = 9999999.99 #初始化最小信息熵
classLen = len(cMat) - 1 #求总特征个数
for i in range(classLen): #对每个特征计算其信息熵
temp = calEnt(dataMat,ccMat,i) #计算指定特征的信息熵
if temp < minEnt : #更新最小信息熵,并保存其所在索引
minEnt = temp
labelClass = i
return labelClass
def layer(dataMat,cMat,ccMat,nextLayer,bestFe,treeList):
import numpy as np
import copy
datList=[]
cList=[]
ccList=[]
bList=[]
CM = cMat.copy() # 复制一级分类特征表
CCM = ccMat.copy() # 复制二级分类特征表
CM.pop(bestFe) # 删除一级分类特征表中指定的最佳分类特征
CCM.pop(bestFe) # 删除二级分类特征表中指定的最佳分类特征
Branchs = len(ccMat[bestFe])
for branch in range(Branchs):
DM = dataMat.copy() #复制数据源
fdataMat = filter(DM,ccMat,bestFe,branch) #求得最佳特征的二级分类后的数据源
if len(fdataMat) != 0: #对数据源进行清理
m = shape(fdataMat)[0]
for i in range(m):
fdataMat[i].pop(bestFe) #删除数据源中最佳特征的数据项
datList.append(fdataMat)
cList.append(CM)
ccList.append(CCM)
bList.append(branch)
brans = len(datList)
if brans == 0:
return
for bran in range(brans):
fdataMat = datList[bran]
CM = cList[bran]
CCM = ccList[bran]
branch = bList[bran]
ffdataMat = mat(fdataMat)
ccountMat = countIFA(ffdataMat[:, -1], CCM[-1])
Ent = calcShannEnt(ccountMat)
if Ent == 0:
# print(nextLayer,branch,fdataMat[-1][-1],ccMat[bestFe][branch],"leaf")
treeList.append([nextLayer,Name2Index(cMat[bestFe]),branch,fdataMat[-1][-1],ccMat[bestFe][branch],"leaf"])
else:
newbestFe = BestFeature(fdataMat, CM, CCM)
if newbestFe != -1 :
# print(nextLayer,branch,CM[newbestFe],ccMat[bestFe][branch])
treeList.append([nextLayer,Name2Index(cMat[bestFe]),branch,CM[newbestFe],ccMat[bestFe][branch]])
layer(fdataMat,CM,CCM,nextLayer+1,newbestFe,treeList)
return
def CreateTree(filename):
# dataMat = loadDataSet('test.txt')
treeList=[]
dataMat = loadDataSet(filename)
cMat = AcMat
ccMat = AccMat
bf = BestFeature(dataMat,cMat,ccMat)
if bf != -1 :
# print(0,0,cMat[bf])
treeList.append([0,0,0,cMat[bf]])
nextlayer = 1
layer(dataMat,cMat,ccMat,nextlayer,bf,treeList)
return treeList
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
xytext=centerPt, textcoords='axes fraction',
va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )
def plotMidText(cntrPt, parentPt, txtString):
xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=0)
def createPlot(treeList,grid):
fig = plt.figure(1, facecolor='white')
fig.clf()
createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses
temptl = mat(treeList)
row = shape(temptl)[1]
lay = shape(mat(grid))[0]
laygrid = 1.0/lay
for i in range(row):
ly = int(treeList[i][0])
grid[ly][0] += 1
if i == 0:
plotNode(treeList[i][3], (grid[ly][1] * grid[ly][0], 1 - laygrid * ly), (grid[ly][1] * grid[ly][0], 1 - laygrid * ly),
decisionNode)
elif treeList[i][-1] =='leaf':
plotNode(treeList[i][3], (grid[ly][1]*grid[ly][0], 1-laygrid*ly), (grid[ly-1][1]*grid[ly-1][0], 1-laygrid*(ly-1)), leafNode)
plotMidText([grid[ly][1]*grid[ly][0], 1-laygrid*ly],[grid[ly-1][1]*grid[ly-1][0], 1-laygrid*(ly-1)],treeList[i][4])
else:
plotNode(treeList[i][3], (grid[ly][1]*grid[ly][0], 1-laygrid*ly), (grid[ly-1][1]*grid[ly-1][0], 1-laygrid*(ly-1)), decisionNode)
plotMidText([grid[ly][1] * grid[ly][0], 1 - laygrid * ly],
[grid[ly - 1][1] * grid[ly - 1][0], 1 - laygrid * (ly - 1)], treeList[i][4])
plt.show()
def GetGrid(treeList):
grid = []
temptl = mat(treeList)
ly = 0
row = shape(temptl)[1]
for i in range(row):
if treeList[i][0] > ly:
ly = treeList[i][0]
for i in range(ly+1):
grid.append([0,0])
for i in range(row):
t= treeList[i][0]
grid[t][0] += 1
for i in range(ly+1):
grid[i][1] = 1.0 / (grid[i][0] + 1)
grid[i][0] = 0
return grid
def Name2Index(NameStr):
index = -2
col = len(AcMat)
for i in range(col):
if NameStr == AcMat[i]:
index = i
return index
def FindextoBranch(index,item):
tempt = mat(item)
row = shape(tempt)[0]
for i in range(row):
if item[i][0]==index:
return item[i]
def treefilter(tree,node):
tempt = mat(tree)
row = shape(tempt)[1]
for i in range(row):
if (tree[i][0]==node[0] and tree[i][1]==node[1] and tree[i][2]==node[2]):
return tree[i]
def test(tree,item):
ItemIndex = []
for k in range(len(item)):
for i in range(len(AcMat)):
for j in range(len(AccMat[i])):
if item[k] == AccMat[i][j]:
ItemIndex.append([i,j])
node = [0,0,0]
while(True):
bran = treefilter(tree, node)
if bran[-1] == 'leaf':
return bran[3]
else:
node[0] = bran[0]+1
findex = Name2Index(bran[3])
btemp = FindextoBranch(findex,ItemIndex)
node[1] = btemp[0]
node[2] = btemp[1]
def Judge(treeList,item):
return test(treeList, item)
def main():
treeList = CreateTree('lenses.txt')
grid = GetGrid(treeList)
createPlot(treeList,grid)
# c = 0
# dataMat = loadDataSet('lenses.txt')
# t = mat(dataMat)
# row = shape(t)[0]
# for i in range(row):
# if test(treeList,dataMat[i]) == dataMat[i][-1]:
# c += 1
# print((c/row)*100)
item = ['young','hyper','no','normal']
print(Judge(treeList, item))
main()
运行结果为
但正确结果应为
使用的环境为pycharm 3.7
请大家帮我看看哪里有问题,谢谢!