
- Implementing a Decision Tree in Code (the ID3 Algorithm)
- Introduction to ID3
- Code Implementation
- Implementing the Decision Tree with sklearn
- References
Introduction to ID3
ID3 (Iterative Dichotomiser 3) was developed by Ross Quinlan in 1986. The algorithm creates a multiway tree, finding for each node the categorical feature that yields the largest information gain for the target. Trees are grown to their maximum size, and a pruning step is then applied to improve the tree's ability to generalize to unseen data.
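For reference, ID3's splitting rule rests on two quantities, which the infoEntropy and infoGain functions below implement. In the standard textbook notation (this formulation is added here for clarity, not taken from the original post):

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k$$
$$\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

where $p_k$ is the proportion of class $k$ in data set $D$, and $D^v$ is the subset of $D$ on which attribute $a$ takes its $v$-th value. At each node, ID3 splits on the attribute with the largest $\mathrm{Gain}$.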
Code Implementation
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
from collections import Counter
from math import log2
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False    # both rcParams must be set by hand
# Read the CSV file into a DataFrame
def getData(filePath):
    data = pd.read_csv(filePath)
    return data

# Convert the DataFrame to a list of rows, dropping the first (ID) column
def dataDeal(data):
    dataList = np.array(data).tolist()
    dataSet = [element[1:] for element in dataList]
    return dataSet
# Tree visualization
# Box styles for decision nodes and leaf nodes (shape, padding, fill color)
# and the arrow style for the edges
decisionNode = dict(boxstyle="square,pad=0.5", fc="0.9")
leafNode = dict(boxstyle="round4,pad=0.5", fc="0.9")
arrow_args = dict(arrowstyle="<-", connectionstyle="arc3", shrinkA=0,
                  shrinkB=16)

# Draw a single node with an arrow pointing from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,
                            xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="top", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)
# Count the leaf nodes of the tree (recursively)
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

# Compute the depth of the tree (recursively)
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth

# Write the branch value midway along the edge between parent and child
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)
# Recursively plot the whole tree in axes-fraction coordinates
def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))   # total width = number of leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth of the tree
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
# Get the attribute names: all columns except the ID and the class label
def getLabels(data):
    labels = list(data.columns)[1:-1]
    return labels

# Get the set of class labels present in the data set
def targetClass(dataSet):
    classification = set([element[-1] for element in dataSet])
    return classification

# When a branch node must become a leaf, label it with the majority class
def majorityRule(dataSet):
    mostKind = Counter([element[-1] for element in dataSet]).most_common(1)
    majorityKind = mostKind[0][0]
    return majorityKind
# Compute the information entropy of a data set
def infoEntropy(dataSet):
    classColumnCnt = Counter([element[-1] for element in dataSet])
    Ent = 0
    for symbol in classColumnCnt:
        p_k = classColumnCnt[symbol] / len(dataSet)
        Ent = Ent - p_k * log2(p_k)
    return Ent
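# Worked check (assuming the standard 17-sample watermelon data set, with
# 8 good and 9 bad melons, which the output below is consistent with):
#   Ent(D) = -(8/17)*log2(8/17) - (9/17)*log2(9/17) ≈ 0.998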
# Build the subset of samples whose value in column iColumn equals `value`,
# with that column removed
def makeAttributeData(dataSet, value, iColumn):
    attributeData = []
    for element in dataSet:
        if element[iColumn] == value:
            row = element[:iColumn]
            row.extend(element[iColumn + 1:])
            attributeData.append(row)
    return attributeData
# Compute the information gain of splitting on column iColumn
def infoGain(dataSet, iColumn):
    Ent = infoEntropy(dataSet)
    tempGain = 0.0
    attribute = set([element[iColumn] for element in dataSet])
    for value in attribute:
        attributeData = makeAttributeData(dataSet, value, iColumn)
        tempGain = tempGain + len(attributeData) / len(dataSet) * infoEntropy(attributeData)
    Gain = Ent - tempGain
    return Gain
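# Worked check (under the same assumption about the data set): splitting on
# 纹理 (texture) gives Gain(D, 纹理) ≈ 0.381, the largest of the six
# attributes, which is why 纹理 ends up as the root of the tree printed below.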
# Choose the attribute with the largest information gain
def selectOptimalAttribute(dataSet, labels):
    bestGain = 0
    sequence = 0
    for iColumn in range(0, len(labels)):  # the final class column is not counted
        Gain = infoGain(dataSet, iColumn)
        if Gain > bestGain:
            bestGain = Gain
            sequence = iColumn
        # print(labels[iColumn], Gain)
    return sequence
# Build the decision tree recursively
def createTree(dataSet, labels):
    classification = targetClass(dataSet)  # distinct class labels (deduplicated via set)
    if len(classification) == 1:
        return list(classification)[0]
    if len(labels) == 1:
        return majorityRule(dataSet)  # return the class with the most samples
    sequence = selectOptimalAttribute(dataSet, labels)
    # print(labels)
    optimalAttribute = labels[sequence]
    del(labels[sequence])
    myTree = {optimalAttribute: {}}
    attribute = set([element[sequence] for element in dataSet])
    for value in attribute:
        # print(myTree)
        # print(value)
        subLabels = labels[:]
        myTree[optimalAttribute][value] = createTree(
            makeAttributeData(dataSet, value, sequence), subLabels)
    return myTree
def main():
    filePath = r'D:\Download\watermalon.csv'
    data = getData(filePath)
    dataSet = dataDeal(data)
    labels = getLabels(data)
    myTree = createTree(dataSet, labels)
    return myTree

if __name__ == '__main__':
    myTree = main()
    print(myTree)
    createPlot(myTree)
Running the script prints the learned tree as a nested dict and then draws it:
{'纹理': {'稍糊': {'触感': {'软粘': '是', '硬滑': '否'}}, '模糊': '否', '清晰': {'根蒂': {'蜷缩': '是', '稍蜷': {'触感': {'软粘': {'敲声': {'浊响': '是'}}, '硬滑': '是'}}, '硬挺': '否'}}}}
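The script builds and draws the tree but never uses it to classify anything. As a minimal sketch of how a nested dict of this shape can classify a new sample (the classify helper and the example sample are illustrative additions, not part of the original code):

# Walk the nested-dict tree to classify one sample (illustrative helper;
# an attribute value unseen during training would raise a KeyError here).
def classify(tree, sample):
    if not isinstance(tree, dict):    # reached a leaf: it holds the class label
        return tree
    attribute = list(tree.keys())[0]  # the attribute tested at this node
    subtree = tree[attribute][sample[attribute]]
    return classify(subtree, sample)

# Example with the tree printed above:
# classify(myTree, {'纹理': '清晰', '根蒂': '蜷缩'})  ->  '是'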
Implementing the Decision Tree with sklearn
The code is as follows:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
def watermalon():
    """
    Watermelon data set
    :return: None
    """
    # Read the file and prepare the data
    watermalon = pd.read_csv(r"D:\Download\watermalon.csv")
    # Split the data into feature values and the target value
    x = watermalon[["色泽", "根蒂", "敲声", "纹理", "脐部", "触感"]]  # the six feature columns
    y = watermalon["好瓜"]  # the target column: is it a good melon?
    print(x)  # print the features to check them
    print("__________________________________")
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    # Feature engineering: transform the data set into one-hot form
    dict = DictVectorizer(sparse=False)  # note: this name shadows the built-in dict
    # .to_dict(orient="records") turns each row into a dict, because
    # DictVectorizer expects a list of dicts as its input
    x_train = dict.fit_transform(x_train.to_dict(orient="records"))
    # print(dict.get_feature_names())
    x_test = dict.transform(x_test.to_dict(orient="records"))
    # Predict with a decision tree; DecisionTreeClassifier is the sklearn API for this.
    # DecisionTreeClassifier uses the Gini index as its split criterion by default;
    # pass criterion='entropy' to select features by information gain instead.
    # No pruning is applied by default; if the tree overfits, it can be constrained
    # with parameters such as max_depth or min_samples_leaf, or post-pruned via ccp_alpha.
    # From the official documentation:
    # criterion{"gini", "entropy"}, default="gini"
    # The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.
    tre = DecisionTreeClassifier(criterion='entropy')
    tre.fit(x_train, y_train)
    # Print the accuracy on the test set
    print(tre.score(x_test, y_test))
    return None
if __name__ == '__main__':
    watermalon()
Sample output (the accuracy varies between runs, since train_test_split shuffles the data):
    色泽  根蒂  敲声  纹理  脐部  触感
0   青绿  蜷缩  浊响  清晰  凹陷  硬滑
1   乌黑  蜷缩  沉闷  清晰  凹陷  硬滑
2   乌黑  蜷缩  浊响  清晰  凹陷  硬滑
3   青绿  蜷缩  沉闷  清晰  凹陷  硬滑
4   浅白  蜷缩  浊响  清晰  凹陷  硬滑
5   青绿  稍蜷  浊响  清晰  稍凹  软粘
6   乌黑  稍蜷  浊响  稍糊  稍凹  软粘
7   乌黑  稍蜷  浊响  清晰  稍凹  硬滑
8   乌黑  稍蜷  沉闷  稍糊  稍凹  硬滑
9   青绿  硬挺  清脆  清晰  平坦  软粘
10  浅白  硬挺  清脆  模糊  平坦  硬滑
11  浅白  蜷缩  浊响  模糊  平坦  软粘
12  青绿  稍蜷  浊响  稍糊  凹陷  硬滑
13  浅白  稍蜷  沉闷  稍糊  凹陷  硬滑
14  乌黑  稍蜷  浊响  清晰  稍凹  软粘
15  浅白  蜷缩  浊响  模糊  平坦  硬滑
16  青绿  蜷缩  沉闷  稍糊  稍凹  硬滑
__________________________________
0.6
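To make the one-hot step concrete, here is a minimal standalone sketch of what DictVectorizer does with records like these (the records and the exact column order shown are illustrative, not taken from the original post):

from sklearn.feature_extraction import DictVectorizer

records = [
    {"色泽": "青绿", "根蒂": "蜷缩", "触感": "硬滑"},
    {"色泽": "乌黑", "根蒂": "稍蜷", "触感": "软粘"},
]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(records)      # every attribute=value pair becomes a 0/1 column
print(vec.get_feature_names_out())  # e.g. ['根蒂=稍蜷' '根蒂=蜷缩' '色泽=乌黑' ...]
print(X)                            # two rows of one-hot indicators

On older sklearn versions the method is get_feature_names, as in the commented-out line in the script above.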
References
The sklearn website
机器学习笔记(4)——ID3决策树算法及其Python实现 (Machine Learning Notes (4): the ID3 decision-tree algorithm and its Python implementation)
python机器学习sklearn之决策树 (Python machine learning with sklearn: decision trees)