
想问问这个应该怎么分析,刚入门ID3对于这些想有一个标准的参考,感谢大家!
关注让阿豪来帮你解答,本回答参考chatgpt3.5编写提供,如果还有疑问可以评论或留言。这个问题要求使用ID3算法对给定数据集进行决策树学习,然后根据决策树进行分类预测。下面是具体步骤:
[["youth","no","no","fair","no"],
["youth","no","no","excellent","no"],
["middle_aged","no","no","excellent","yes"],
...
["senior","yes","yes","excellent","no"]]
# Training data. Each row is one sample:
# [age, has_work, has_house, credit_rating, class_label]
# The last column ("yes"/"no") is the class to predict.
dataset = [
["youth","no","no","fair","no"],
["youth","no","no","excellent","no"],
["middle_aged","no","no","excellent","yes"],
["senior","no","yes","fair","yes"],
["senior","yes","no","fair","yes"],
["middle_aged","no","no","fair","yes"],
["youth","yes","yes","fair","no"],
["youth","no","no","fair","yes"],
["senior","yes","yes","fair","yes"],
["youth","yes","yes","excellent","yes"],
["middle_aged","yes","no","excellent","yes"],
["middle_aged","no","yes","fair","yes"],
["senior","yes","no","excellent","no"]
]
# Column names: the four feature columns followed by the label column.
labels = ["age", "work", "house", "credit rating", "label"]
class Node:
    """A single node of the ID3 decision tree.

    A leaf node carries a predicted class in ``label``; an internal node
    carries the feature name it splits on in ``feature`` plus a mapping
    from feature value to child node in ``branch``.
    """

    def __init__(self, label=None, feature=None, branch=None, number=None):
        # Predicted class label (set on leaf nodes; None on internal nodes).
        self.label = label
        # Name of the feature this node splits on (internal nodes only).
        self.feature = feature
        # dict mapping a feature value -> child Node (internal nodes only).
        self.branch = branch
        # Numeric identifier assigned when the tree is built.
        self.number = number
# Compute information entropy
def calcEntropy(dataSet):
    """Return the Shannon entropy (in bits) of the class labels in *dataSet*.

    Each row's last element is taken as its class label.  An empty
    dataset yields 0.0 by convention (no loop iterations run).

    Bug fix: the original called ``math.log`` but the file never imports
    ``math``, so this raised NameError at runtime; the import is bound
    here at function scope.

    :param dataSet: list of rows; ``row[-1]`` is the class label
    :return: entropy as a float
    """
    import math  # file has no top-level imports; bind math locally

    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(dataSet)
    entropy = 0.0
    for count in counts.values():
        prob = count / total
        entropy -= prob * math.log2(prob)
    return entropy
# Partition the dataset
def splitDataSet(dataSet, feature, value):
    """Return the rows whose column *feature* equals *value*, with that
    column removed from each returned row.

    :param dataSet: list of rows
    :param feature: column index to filter on
    :param value: value the column must equal
    :return: new list of shortened rows (input rows are not mutated)
    """
    return [
        row[:feature] + row[feature + 1:]
        for row in dataSet
        if row[feature] == value
    ]
# Pick the best splitting feature
def chooseBestFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    Gain is base entropy minus the weighted entropy after splitting on a
    feature.  Returns -1 when no feature gives a strictly positive gain.

    :param dataSet: list of rows; ``row[-1]`` is the class label
    :return: best feature index, or -1
    """
    featureCount = len(dataSet[0]) - 1  # last column is the label
    baseEntropy = calcEntropy(dataSet)
    bestGain, bestIndex = 0, -1
    for index in range(featureCount):
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for value in {row[index] for row in dataSet}:
            subset = splitDataSet(dataSet, index, value)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcEntropy(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, index
    return bestIndex
# Build the ID3 decision tree
def createTree(dataSet, labels, ID=0):
    """Build an ID3 decision tree and return its root :class:`Node`.

    :param dataSet: list of rows; ``row[-1]`` is the class label
    :param labels: feature names, one per non-label column of *dataSet*
    :param ID: number assigned to the root; descendants get ID+1, ID+2, ...
    :return: root Node of the tree

    Bug fix: the original recursed with ``createTree(subDataSet, subLabels)``,
    dropping the ID argument, so every node in the tree was numbered 0.
    A shared counter now gives each node a unique number, and branches are
    expanded in sorted value order so the numbering is deterministic.
    """
    nextNumber = [ID]  # mutable counter shared by all recursive calls

    def _build(rows, featureNames):
        number = nextNumber[0]
        nextNumber[0] += 1
        classes = [row[-1] for row in rows]
        # All rows agree on the class: emit a leaf.
        if classes.count(classes[0]) == len(classes):
            return Node(label=classes[0], number=number)
        # No features left to split on: emit a leaf with the majority class.
        if len(rows[0]) == 1:
            counts = {}
            for cls in classes:
                counts[cls] = counts.get(cls, 0) + 1
            majority = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[0][0]
            return Node(label=majority, number=number)
        # Otherwise split on the feature with the largest information gain.
        feature = chooseBestFeature(rows)
        node = Node(feature=featureNames[feature], number=number)
        subLabels = featureNames[:feature] + featureNames[feature + 1:]
        node.branch = {}
        # sorted() fixes the expansion order so node numbers are reproducible.
        for value in sorted({row[feature] for row in rows}):
            node.branch[value] = _build(splitDataSet(rows, feature, value), subLabels)
        return node

    return _build(dataSet, labels)
# Predict the class of one sample
def classify(data, node):
    """Walk the tree from *node* and return the predicted label for *data*.

    Relies on the module-level ``labels`` list to map a node's feature
    name back to a column index in *data*.  If a feature value was never
    seen during training, ``node.label`` is returned — note this is None
    for internal nodes.
    """
    while node.feature is not None:
        value = data[labels.index(node.feature)]
        if value not in node.branch:
            return node.label  # unseen value: fall back to this node's label
        node = node.branch[value]
    return node.label
# Build the decision tree from the full training set.
tree = createTree(dataset, labels)
# Classify a previously unseen sample (age, work, house, credit rating).
newData = ["senior","no","yes","excellent"]
result = classify(newData, tree)
print(result)
输出结果为:
no
即决策树将该样本预测为“no”类别。