This post implements the ID3 and CART decision trees from the decision-tree chapter of Zhou Zhihua's *Machine Learning* (the watermelon book). The code I found online produced results that disagreed with the textbook because of implementation problems; after modifying it, the output matches the book.
The code and the environment setup requirements are on my GitHub: https://github.com/XTWLP/descion_tree
Result figures: the ID3 tree, and the CART trees without pruning, with pre-pruning, and with post-pruning.
## 1. Model code
### 1.1 Decision-tree node definition
# -*- coding: utf-8 -*-
## define the node class
class Node(object):
def __init__(self, attr_init=None, label_init=None, attr_down_init={} ):
self.attr = attr_init
self.label = label_init
self.attr_down = attr_down_init
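As a quick illustration of this data structure (a hypothetical hand-built fragment, not something the training code produces), an internal node splitting on 纹理 whose value 清晰 leads to a leaf predicting 是 looks like this:

```python
# hypothetical hand-built fragment showing how Node represents a tree
leaf = Node(None, '是', {})                 # leaf: no attribute, only a class label
root = Node('纹理', '是', {'清晰': leaf})    # internal node: attr plus a value -> child mapping
print(root.attr_down['清晰'].label)          # -> '是'
```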
### 1.2 Tree-building procedure
# tree-building (training) function
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_3.csv", mode = 'r', encoding = data_file_encode) as data_file:
data = pd.read_csv(data_file)
def TreeGenerate(df):
'''
Branching for decision tree using recursion
@param df: the pandas dataframe of the data_set
@return root: Node, the root node of decision tree
'''
# generating a new root node
new_node = Node(None, None, {})
label_arr = df[df.columns[-1]]
label_count = NodeLabel(label_arr)
if label_count: # assert the label_count isn't empty
new_node.label= max(label_count, key=label_count.get)
# end if there is only 1 class in current node data
# end if attribution array is empty
if len(label_count) == 1 or len(label_arr) == 0:
return new_node
# get the optimal attribution for a new branching
new_node.attr, div_value = OptAttr(df) # selected via information gain
# recursion
if div_value == 0: # categoric variable
value_count = ValueCount(data[new_node.attr]) # possible values are taken from the full data set `data`, not the subset df
for value in value_count:
df_v = df[ df[new_node.attr].isin([value]) ] # get sub set
# delete current attribution
df_v = df_v.drop(columns=new_node.attr) # keyword form; the positional axis argument is removed in recent pandas
if len(df_v) == 0:
# empty subset: this branch becomes a leaf labelled with the parent's majority class;
# the early return mirrors the "return" in the corresponding branch of the book's pseudocode (Fig. 4.2)
new_node.attr_down[value] = Node(None,new_node.label, {})
return new_node
else:
new_node.attr_down[value] = TreeGenerate(df_v) # recursively build the child node
else: # continuous variable # left and right child
value_l = "<=%.3f" % div_value
value_r = ">%.3f" % div_value
df_v_l = df[ df[new_node.attr] <= div_value ] # get sub set
df_v_r = df[ df[new_node.attr] > div_value ]
new_node.attr_down[value_l] = TreeGenerate(df_v_l)
new_node.attr_down[value_r] = TreeGenerate(df_v_r)
return new_node
### 1.3 Prediction with the decision tree
'''
make a prediction starting from the root node
@param root: Node, root node of the decision tree
@param df_sample: dataframe, a single sample row
'''
def Predict(root, df_sample):
try:
import re # use a regular expression to extract the number embedded in the key string
except ImportError :
print("module re not found")
while root.attr != None :
# continuous attribute
if df_sample[root.attr].dtype == float :
# recover div_value from the keys of root.attr_down
for key in list(root.attr_down):
num = re.findall(r"\d+\.?\d*",key)
div_value = float(num[0])
break
if df_sample[root.attr].values[0] <= div_value:
key = "<=%.3f" % div_value
root = root.attr_down[key]
else:
key = ">%.3f" % div_value
root = root.attr_down[key]
# categorical attribute
else:
key = df_sample[root.attr].values[0]
# check whether this attribute value has a child branch
if key in root.attr_down:
root = root.attr_down[key]
else:
break
return root.label
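A minimal usage sketch (hypothetical: it assumes `df` has been loaded from watermelon_3.csv as in section 2.1 below and that a tree has already been built with `root = TreeGenerate(df)`):

```python
# hedged usage sketch: predict the class label of the first sample row
sample = df[df.index == 0]      # Predict expects a one-row dataframe
print(Predict(root, sample))    # prints the predicted label, e.g. '是' or '否'
```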
### 1.4 Counting label classes and attribute values
'''
count the class labels that appear in the data
@param label_arr: data array of class labels
@return label_count: dict, each label that appears and its count
'''
def NodeLabel(label_arr):
label_count = {} # store the count of each class label
for label in label_arr:
if label in label_count: label_count[label] += 1
else: label_count[label] = 1
return label_count
'''
count the values that appear for a categorical attribute
@param data_arr: data array of one attribute
@return value_count: dict, each value that appears and its count
'''
def ValueCount(data_arr):
value_count = {} # store count of value
for label in data_arr:
if label in value_count: value_count[label] += 1
else: value_count[label] = 1
return value_count
### 1.5 Choosing the optimal attribute by information gain
'''
find the optimal splitting attribute for the current data set
@param df: the pandas dataframe of the data set
@return opt_attr: the best attribute for branching
@return div_value: 0 for a categorical attribute; for a continuous attribute, the bisection split point t
'''
def OptAttr(df):
info_gain = 0
for attr_id in df.columns[1:-1]: # skip the first column (ID) and the last column (label)
info_gain_tmp, div_value_tmp = InfoGain(df, attr_id) # information gain of this attribute
if info_gain_tmp > info_gain : # keep the attribute and split point with the largest gain
info_gain = info_gain_tmp
opt_attr = attr_id
div_value = div_value_tmp
return opt_attr, div_value
### 1.6 Computing information gain (bisection for continuous attributes)
'''
ID3 selects the splitting attribute by maximising information gain; the main work here is handling
categorical and continuous attributes separately.
Categorical attributes follow the book, pp. 75-77.
Continuous attributes use the bisection method described on pp. 83-85.
See pp. 75-76 and p. 84 for information entropy, gain maximisation and the bisection split.
'''
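For reference, the quantities the function below computes are the standard ones from the pages cited above: the information entropy of a sample set $D$ and the information gain of an attribute $a$,

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k, \qquad \mathrm{Gain}(D,a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v),$$

and, for a continuous attribute with candidate split points $T_a$ (the midpoints of adjacent sorted values),

$$\mathrm{Gain}(D,a) = \max_{t \in T_a}\; \mathrm{Ent}(D) - \sum_{\lambda \in \{-,+\}} \frac{|D_t^{\lambda}|}{|D|}\,\mathrm{Ent}(D_t^{\lambda}).$$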
def InfoGain(df, index):
info_gain = InfoEnt(df.values[:,-1]) # entropy of the whole label column
div_value = 0 # split point for a continuous attribute (bisection)
n = len(df[index]) # number of samples
# 1. continuous attribute: bisection method
if df[index].dtype == float:
sub_info_ent = {} # candidate split point -> weighted entropy of the two subsets
df = df.sort_values([index], ascending=True) # sort by the attribute values
df = df.reset_index(drop=True)
data_arr = df[index]
label_arr = df[df.columns[-1]]
for i in range(n-1):
div = (data_arr[i] + data_arr[i+1]) / 2
sub_info_ent[div] = ( (i+1) * InfoEnt(label_arr[0:i+1]) / n ) \
+ ( (n-i-1) * InfoEnt(label_arr[i+1:]) / n ) # slice to the end so the last sample is not dropped
# the goal is the split point with the minimum weighted subset entropy (book, p. 84)
div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
info_gain -= sub_info_ent_min
# 2. categorical attribute: compute the gain directly
else:
data_arr = df[index]
label_arr = df[df.columns[-1]]
value_count = ValueCount(data_arr)
for key in value_count:
key_label_arr = label_arr[data_arr == key]
info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
return info_gain, div_value
### 1.7 Computing information entropy
'''
compute the information entropy of a label array
'''
def InfoEnt(label_arr):
try :
from math import log2
except ImportError :
print("module math.log2 not found")
ent = 0
n = len(label_arr)
label_count = NodeLabel(label_arr) # label_count {'是': 8, '否': 9}
for key in label_count:
ent -= ( label_count[key] / n ) * log2( label_count[key] / n )
return ent
### 1.8 Converting the tree into a graph for visualization
def TreeToGraph(i, g, root):
'''
build the graph recursively, starting from root
@param i: node number within this tree
@param g: pydotplus.graphviz.Dot() object
@param root: the current root node
@return i: node number after the subtree has been added
@return g: the modified pydotplus.graphviz.Dot() object
@return g_node: the graphviz node corresponding to root
'''
try:
from pydotplus import graphviz
except ImportError:
print("module pydotplus.graphviz not found")
if root.attr == None:
g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
else:
g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
g_node = i
g.add_node(graphviz.Node(g_node, label=g_node_label, fontname="FangSong"))
for value in list(root.attr_down):
i, g_child = TreeToGraph(i + 1, g, root.attr_down[value])
g.add_edge(graphviz.Edge(g_node, g_child, label=value, fontname="FangSong"))
return i, g_node
def DrawPNG(root, out_file):
'''
visualize the decision tree starting from root
@param root: Node, root node of the tree
@param out_file: str, name and path of the output file
'''
try:
from pydotplus import graphviz
except ImportError:
print("module pydotplus.graphviz not found")
g = graphviz.Dot() # generation of new dot
TreeToGraph(0, g, root)
g2 = graphviz.graph_from_dot_data( g.to_string() )
g2.write_png(out_file)
## 2. Using the ID3 decision-tree model
### 2.1 Load the data and build the tree
# import packages and load the data
import matplotlib.pyplot as plt
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_3.csv", mode = 'r', encoding = data_file_encode) as data_file:
df = pd.read_csv(data_file)
root = TreeGenerate(df)
### 2.2 k-fold cross-validation to estimate generalization
accuracy_scores = []
# k-fold cross-validation
n = len(df.index)
k = 10
for i in range(k):
m = int(n/k)
test = []
for j in range(i*m, i*m+m):
test.append(j) # row indices of the i-th (contiguous) test fold
df_train = df.drop(test) # split into training and test sets
df_test = df.iloc[test]
root = TreeGenerate(df_train) # train: generate the decision tree
# accuracy of the tree on the test fold
pred_true = 0
for i in df_test.index:
label = Predict(root, df[df.index == i])
if label == df_test[df_test.columns[-1]][i]:
pred_true += 1
accuracy = pred_true / len(df_test.index)
accuracy_scores.append(accuracy)
### 2.3 Average accuracy
# average accuracy over the k folds
accuracy_sum = 0
print("accuracy: ", end = "")
for i in range(k):
print("%.3f " % accuracy_scores[i], end = "")
accuracy_sum += accuracy_scores[i]
print("\naverage accuracy: %.3f" % (accuracy_sum/k))
root = TreeGenerate(df) # rebuild on the full data and visualize with pydotplus.graphviz
DrawPNG(root, "decision_tree_ID3.png")
# CART decision-tree model
# Part 1: Building the decision tree
## 1. Node definition
# define the node (the same class as in the ID3 code)
class Node(object):
'''
definition of decision node class
attr: attribution as parent for a new branching
attr_down: dict: {key, value}
key: categoric: categoric attr_value
continuous: '<= div_value' for small part
'> div_value' for big part
value: children (Node class)
label: class label (the majority of current sample labels)
'''
def __init__(self, attr_init=None, label_init=None, attr_down_init={} ):
self.attr = attr_init
self.label = label_init
self.attr_down = attr_down_init
## 2. Building the tree
### (1) Helper functions used when building the tree
# count the class labels (good/bad melon) in the samples, and how many of each there are
def NodeLabel(label_arr):
'''
calculating the appeared label and it's counts
@param label_arr: data array for class labels
@return label_count: dict, the appeared label and it's counts
'''
label_count = {} # store count of label
for label in label_arr:
if label in label_count: label_count[label] += 1
else: label_count[label] = 1
return label_count
# all values taken by an attribute, and the number of samples with each value
def ValueCount(data_arr):
'''
calculating the appeared value for categoric attribute and it's counts
@param data_arr: data array for an attribute
@return value_count: dict, the appeared value and it's counts
'''
value_count = {} # store count of value
for label in data_arr:
if label in value_count: value_count[label] += 1
else: value_count[label] = 1
return value_count
### (2) Tree construction and its utility functions (prediction, accuracy)
# tree-building (training) function
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_2.csv", mode = 'r', encoding = data_file_encode) as data_file:
data = pd.read_csv(data_file)
def TreeGenerate(df):
'''
Branching for decision tree using recursion
@param df: the pandas dataframe of the data_set
@return root: Node, the root node of decision tree
'''
# generating a new root node
new_node = Node(None, None, {})
label_arr = df[df.columns[-1]]
label_count = NodeLabel(label_arr)
if label_count: # assert the label_count isn't empty
new_node.label= max(label_count, key=label_count.get)
# end if there is only 1 class in current node data
# end if attribution array is empty
if len(label_count) == 1 or len(label_arr) == 0:
return new_node
# get the optimal attribution for a new branching
new_node.attr, div_value = OptAttr_Ent(df) # via information gain; swap in OptAttr_Gini here for the Gini (CART) criterion
# recursion
if div_value == 0: # categoric variable
value_count = ValueCount(data[new_node.attr])
for value in value_count:
df_v = df[ df[new_node.attr].isin([value]) ] # get sub set
# delete current attribution
df_v = df_v.drop(columns=new_node.attr)
if len(df_v) == 0:
new_node.attr_down[value] = Node(None,new_node.label, {})
return new_node
else:
new_node.attr_down[value] = TreeGenerate(df_v) # recursively build the child node
else: # continuous variable # left and right child
value_l = "<=%.3f" % div_value
value_r = ">%.3f" % div_value
df_v_l = df[ df[new_node.attr] <= div_value ] # get sub set
df_v_r = df[ df[new_node.attr] > div_value ]
new_node.attr_down[value_l] = TreeGenerate(df_v_l)
new_node.attr_down[value_r] = TreeGenerate(df_v_r)
return new_node
# prediction function
def Predict(root, df_sample):
'''
make a predict based on root
@param root: Node, root Node of the decision tree
@param df_sample: dataframe, a sample line
'''
try :
import re # using Regular Expression to get the number in string
except ImportError :
print("module re not found")
while root.attr != None :
# continuous variable
if df_sample[root.attr].dtype == float:
# get the div_value from root.attr_down
for key in list(root.attr_down):
num = re.findall(r"\d+\.?\d*",key)
div_value = float(num[0])
break
if df_sample[root.attr].values[0] <= div_value:
key = "<=%.3f" % div_value
root = root.attr_down[key]
else:
key = ">%.3f" % div_value
root = root.attr_down[key]
# categoric variable
else:
key = df_sample[root.attr].values[0]
# check whether the attr_value in the child branch
if key in root.attr_down:
root = root.attr_down[key]
else:
break
return root.label
# function that computes prediction accuracy on a test set
def PredictAccuracy(root, df_test):
'''
calculating accuracy of prediction on test set
@param root: Node, root Node of the decision tree
@param df_test: dataframe, test data set
@return accuracy, float,
'''
if len(df_test.index) == 0: return 0
pred_true = 0
for i in df_test.index:
label = Predict(root, df_test[df_test.index == i])
if label == df_test[df_test.columns[-1]][i]:
pred_true += 1
return pred_true / len(df_test.index)
### (3) Information gain and information entropy
'''
optimal attribute selection in the ID3 algorithm, based on information entropy
'''
# pick the attribute with the largest information gain
def OptAttr_Ent(df):
'''
find the optimal attributes of current data_set based on info entropy
@param df: the pandas dataframe of the data_set
@return opt_attr: the optimal attribution for branch
@return div_value: for discrete variable value = 0
for continuous variable value = t for bisection divide value
'''
info_gain = 0
for attr_id in df.columns[1:-1]:
info_gain_tmp, div_value_tmp = InfoGain(df, attr_id)
if info_gain_tmp > info_gain :
info_gain = info_gain_tmp
opt_attr = attr_id
div_value = div_value_tmp
return opt_attr, div_value
# information gain of one attribute
def InfoGain(df, attr_id):
'''
calculating the information gain of an attribution
@param df: dataframe, the pandas dataframe of the data_set
@param attr_id: the target attribution in df
@return info_gain: the information gain of current attribution
@return div_value: for discrete variable, value = 0
for continuous variable, value = t (the division value)
'''
info_gain = InfoEnt(df.values[:,-1]) # info_gain for the whole label
div_value = 0 # div_value for continuous attribute
n = len(df[attr_id]) # the number of sample
# 1.for continuous variable using method of bisection
if df[attr_id].dtype == float:
sub_info_ent = {} # store the div_value (div) and it's subset entropy
df = df.sort_values([attr_id], ascending=1) # sorting via column
df = df.reset_index(drop=True)
data_arr = df[attr_id]
label_arr = df[df.columns[-1]]
for i in range(n-1):
div = (data_arr[i] + data_arr[i+1]) / 2
sub_info_ent[div] = ( (i+1) * InfoEnt(label_arr[0:i+1]) / n ) \
+ ( (n-i-1) * InfoEnt(label_arr[i+1:]) / n ) # slice to the end so the last sample is not dropped
# our goal is to get the minimum weighted subset entropy and its divide value
div_value, sub_info_ent_min = min(sub_info_ent.items(), key=lambda x: x[1])
info_gain -= sub_info_ent_min
# 2.for discrete variable (categoric variable)
else:
data_arr = df[attr_id]
label_arr = df[df.columns[-1]]
value_count = ValueCount(data_arr)
for key in value_count:
key_label_arr = label_arr[data_arr == key]
info_gain -= value_count[key] * InfoEnt(key_label_arr) / n
return info_gain, div_value
# information entropy of a label array
def InfoEnt(label_arr):
'''
calculating the information entropy of an attribution
@param label_arr: ndarray, class label array of data_arr
@return ent: the information entropy of current attribution
'''
try :
from math import log2
except ImportError :
print("module math.log2 not found")
ent = 0
n = len(label_arr)
label_count = NodeLabel(label_arr)
for key in label_count:
ent -= ( label_count[key] / n ) * log2( label_count[key] / n )
return ent
### (4) Gini index and Gini value
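The two functions below follow the book's definitions (p. 79): the Gini value of a sample set $D$ and the Gini index of an attribute $a$,

$$\mathrm{Gini}(D) = 1 - \sum_{k=1}^{|\mathcal{Y}|} p_k^2, \qquad \mathrm{Gini\_index}(D,a) = \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Gini}(D^v),$$

and the splitting attribute is the one that minimises the Gini index.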
# pick the attribute with the smallest Gini index
def OptAttr_Gini(df):
'''
find the optimal attributes of current data_set based on gini index
@param df: the pandas dataframe of the data_set
@return opt_attr: the optimal attribution for branch
@return div_value: for discrete variable value = 0
for continuous variable value = t for bisection divide value
'''
gini_index = float('Inf')
for attr_id in df.columns[1:-1]: # loop over every candidate attribute
gini_index_tmp, div_value_tmp = GiniIndex(df, attr_id) # Gini index and, for a continuous attribute, the split point
if gini_index_tmp < gini_index : # keep the attribute with the smallest Gini index
gini_index = gini_index_tmp
opt_attr = attr_id
div_value = div_value_tmp
return opt_attr, div_value
# compute the Gini index of an attribute
def GiniIndex(df, attr_id):
'''
calculating the gini index of an attribution
@param df: dataframe, the pandas dataframe of the data_set
@param attr_id: the target attribution in df
@return gini_index: the gini index of current attribution
@return div_value: for discrete variable, value = 0
for continuous variable, value = t (the division value)
'''
gini_index = 0 # Gini index accumulated over the attribute's values
div_value = 0 # split point for a continuous attribute
n = len(df[attr_id]) # number of samples
# 1. continuous attribute: bisection method
if df[attr_id].dtype == float:
sub_gini = {} # candidate split point -> weighted Gini value of the two subsets
df = df.sort_values([attr_id], ascending=1) # sorting via column
df = df.reset_index(drop=True)
data_arr = df[attr_id]
label_arr = df[df.columns[-1]]
for i in range(n-1):
div = (data_arr[i] + data_arr[i+1]) / 2
sub_gini[div] = ( (i+1) * Gini(label_arr[0:i+1]) / n ) \
+ ( (n-i-1) * Gini(label_arr[i+1:]) / n ) # slice to the end so the last sample is not dropped
# the goal is the split point with the minimum weighted subset Gini value
div_value, gini_index = min(sub_gini.items(), key=lambda x: x[1])
# 2. categorical attribute: compute the Gini index directly from the definition
else:
data_arr = df[attr_id]
label_arr = df[df.columns[-1]]
value_count = ValueCount(data_arr)
for key in value_count:
key_label_arr = label_arr[data_arr == key]
gini_index += value_count[key] * Gini(key_label_arr) / n
return gini_index, div_value
# compute the Gini value of a label array
def Gini(label_arr):
'''
calculating the gini value of an attribution
@param label_arr: ndarray, class label array of data_arr
@return gini: the information entropy of current attribution
'''
gini = 1
n = len(label_arr)
label_count = NodeLabel(label_arr) # count of each class label in the array
for key in label_count:
gini -= ( label_count[key] / n ) * ( label_count[key] / n )
return gini
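A quick sanity check (a hypothetical snippet; it assumes the class distribution of the book's watermelon data set 2.0, i.e. 8 positive and 9 negative samples):

```python
# hedged check of Gini() on the root class distribution of watermelon data set 2.0
labels = ['是'] * 8 + ['否'] * 9
print(round(Gini(labels), 3))   # 1 - (8/17)**2 - (9/17)**2, about 0.498
```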
### (5) Building a tree with pre-pruning
# pre-pruning function
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_2.csv", mode='r', encoding=data_file_encode) as data_file:
data = pd.read_csv(data_file)
def PrePurn(df_train, df_test, a0 = 0):
'''
pre-pruning while generating a decision tree
@param df_train: dataframe, the training set used to grow the tree
@param df_test: dataframe, the validation set used for the pruning decision
@param a0: float, accuracy of the unsplit parent node (0 means it is computed inside)
@return root: Node, root of the pre-pruned tree
'''
# generating a new root node
new_node = Node(None, None, {})
label_arr = df_train[df_train.columns[-1]]
label_count = NodeLabel(label_arr)
if label_count: # assert the label_count isn't empty
new_node.label = max(label_count, key=label_count.get)
# end if there is only 1 class in current node data
# end if attribution array is empty
if len(label_count) == 1 or len(label_arr) == 0:
return new_node
# calculating the test accuracy up to current node
if a0==0:
a0 = PredictAccuracy(new_node, df_test) # accuracy on the validation set if this node stays an unsplit majority-class leaf
# get the optimal attribution for a new branching
new_node.attr, div_value = OptAttr_Ent(df_train)
# get the new branch
if div_value == 0: # categoric variable
value_count = ValueCount(data[new_node.attr])
for value in value_count:
df_v = df_train[df_train[new_node.attr].isin([value])]
df_v = df_v.drop(columns=new_node.attr)
# branch tentatively first, so the accuracy of the split model can be measured
new_node_child = Node(None, None, {})
label_arr_child = df_v[df_v.columns[-1]]
if len(df_v) == 0:
new_node.attr_down[value] = Node(None,new_node.label, {})
else:
label_count_child = NodeLabel(label_arr_child)
new_node_child.label = max(label_count_child, key=label_count_child.get)
new_node.attr_down[value] = new_node_child
# calculating to check whether need further branching
a1 = PredictAccuracy(new_node, df_test) # accuracy on the validation set after the tentative split
if a1 > a0: # the split improves accuracy, so recurse and grow the subtrees
for value in value_count:
df_v = df_train[df_train[new_node.attr].isin([value])]
df_v = df_v.drop(columns=new_node.attr)
new_node.attr_down[value] = PrePurn(df_v,df_test,a1)
else: # the unsplit node is at least as accurate, so keep it as a leaf (i.e. prune)
new_node.attr = None
new_node.attr_down = {}
else: # continuous variable # left and right child
value_l = "<=%.3f" % div_value
value_r = ">%.3f" % div_value
df_v_l = df_train[df_train[new_node.attr] <= div_value] # get sub set
df_v_r = df_train[df_train[new_node.attr] > div_value]
# for child node
new_node_l = Node(None, None, {})
new_node_r = Node(None, None, {})
label_count_l = NodeLabel(df_v_l[df_v_l.columns[-1]])
label_count_r = NodeLabel(df_v_r[df_v_r.columns[-1]])
# branch tentatively first, so the two accuracies can be compared
new_node_l.label = max(label_count_l, key=label_count_l.get)
new_node_r.label = max(label_count_r, key=label_count_r.get)
new_node.attr_down[value_l] = new_node_l
new_node.attr_down[value_r] = new_node_r
# calculating to check whether need further branching
a1 = PredictAccuracy(new_node, df_test) # accuracy after the tentative split; compare with a0 to decide whether to branch
if a1 > a0: # need branching
new_node.attr_down[value_l] = PrePurn(df_v_l,df_test,a1)
new_node.attr_down[value_r] = PrePurn(df_v_r,df_test,a1)
else:
new_node.attr = None
new_node.attr_down = {}
return new_node
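For intuition, this is the comparison the book itself works through in its pre-pruning example (section 4.3), on the same train/validation split used in Part 2 below: keeping the root as an unsplit majority-class leaf scores 3/7 ≈ 42.9% on the validation set, while splitting the root on 脐部 and labelling each child with its majority class raises this to 5/7 ≈ 71.4%, so that split is kept, and deeper splits that do not improve validation accuracy are cut off.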
### (6) Building a tree with post-pruning
# post-pruning function
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_2.csv", mode = 'r', encoding = data_file_encode) as data_file:
data = pd.read_csv(data_file)
def PostPurn(root, df_test):
'''
post-pruning of an already fully grown decision tree
@param root: Node, root of the tree
@param df_test: dataframe, the validation set used for the pruning decision
@return: accuracy score accumulated while traversing the tree, or -1 if no pruning is needed
'''
# a leaf node: return its prediction accuracy directly
if root.attr == None:
return PredictAccuracy(root, df_test)
# otherwise compute the test accuracy of the child branches
a1 = 0
value_count = ValueCount(data[root.attr])
for value in list(value_count):
df_test_v = df_test[ df_test[root.attr].isin([value]) ] # test subset for this attribute value
if len(df_test_v) == 0:
continue
if value in root.attr_down: # a child branch exists for this value
a1_v = PostPurn(root.attr_down[value], df_test_v) # recurse into the child with the matching test subset
else: # no child branch for this value
a1_v = PredictAccuracy(root, df_test_v) # evaluate directly at this node
if a1_v == -1: # -1 means the child needs no pruning
return -1 # so neither does this node
else:
a1 += a1_v * len(df_test_v.index) / len(df_test.index) # weighted sum of the subset accuracies
# test accuracy if this node were pruned to a majority-class leaf
node = Node(None, root.label, {})
a0 = PredictAccuracy(node, df_test)
# prune if the leaf is strictly better
if a0 > a1:
root.attr = None
root.attr_down = {}
return a0 # accuracy of this node after pruning
else:
return -1 # this node needs no pruning
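In other words, the recursion compares the accuracy $a_0$ of collapsing a node into a majority-class leaf with the validation-subset-weighted accuracy of keeping its branches,

$$a_1 = \sum_{v} \frac{|D^{v}_{\mathrm{test}}|}{|D_{\mathrm{test}}|}\, a_1^{v},$$

where $a_1^{v}$ is the accuracy returned by the recursive call on the branch for value $v$; the node is pruned only when $a_0 > a_1$.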
## 3. Visualizing the decision tree
# visualize and save as PNG
def DrawPNG(root, out_file):
'''
visualization of decision tree from root.
@param root: Node, the root node for tree.
@param out_file: str, name and path of output file
'''
try:
from pydotplus import graphviz
except ImportError:
print("module pydotplus.graphviz not found")
g = graphviz.Dot() # generation of new dot
TreeToGraph(0, g, root)
g2 = graphviz.graph_from_dot_data( g.to_string() )
g2.write_png(out_file)
# convert the tree structure into a graph
def TreeToGraph(i, g, root):
'''
build a graph from root on
@param i: node number in this tree
@param g: pydotplus.graphviz.Dot() object
@param root: the root node
@return i: node number after modified
# @return g: pydotplus.graphviz.Dot() object after modified
@return g_node: the current root node in graphviz
'''
try:
from pydotplus import graphviz
except ImportError:
print("module pydotplus.graphviz not found")
if root.attr == None:
g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
else:
g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
g_node = i
g.add_node( graphviz.Node( g_node, label = g_node_label, fontname="FangSong" ) )
for value in list(root.attr_down):
i, g_child = TreeToGraph(i+1, g, root.attr_down[value])
g.add_edge( graphviz.Edge(g_node, g_child, label = value, fontname="FangSong") )
return i, g_node
# Part 2: Building the trees from the data
# -*- coding: utf-8 -*-
'''
created on 2017/3/24, the day after our national football team beat South Korea
@author: PY131
'''
'''
import data and pre-analysis through data visualization
'''
# using pandas dataframe for .csv read which contains chinese char.
import pandas as pd
data_file_encode = "gb18030"
with open("watermelon_2.csv", mode = 'r', encoding = data_file_encode) as data_file:
df = pd.read_csv(data_file)
# train/validation split; these row indices are the book's training set {1,2,3,6,7,10,14,15,16,17}, 0-indexed
index_train = [0,1,2,5,6,9,13,14,15,16]
df_train = df.iloc[index_train]
df_test = df.drop(index_train)
# generate the full (unpruned) tree
root = TreeGenerate(df_train)
DrawPNG(root, "decision_tree_full.png")
print("accuracy of full tree: %.3f" % PredictAccuracy(root, df_test))
# pre-pruning
root = PrePurn(df_train, df_test)
DrawPNG(root, "decision_tree_pre.png")
print("accuracy of pre-purning tree: %.3f" % PredictAccuracy(root, df_test))
# post-pruning
root = TreeGenerate(df_train)
PostPurn(root, df_test)
DrawPNG(root, "decision_tree_post.png")
print("accuracy of post-purning tree: %.3f" % PredictAccuracy(root, df_test))
## k-fold cross-validation (the same as in the ID3 code)
# print the accuracy
# k-folds cross prediction
accuracy_scores = []
n = len(df.index)
k = 5
for i in range(k):
m = int(n/k)
test = []
for j in range(i*m, i*m+m):
test.append(j)
df_train = df.drop(test)
df_test = df.iloc[test]
root = TreeGenerate(df_train) # generate the tree
PostPurn(root, df_test) # post-purning
# test the accuracy
pred_true = 0
for i in df_test.index:
label = Predict(root, df[df.index == i])
if label == df_test[df_test.columns[-1]][i]:
pred_true += 1
accuracy = pred_true / len(df_test.index)
accuracy_scores.append(accuracy)
# print the prediction accuracy result
accuracy_sum = 0
print("accuracy: ", end = "")
for i in range(k):
print("%.3f " % accuracy_scores[i], end = "")
accuracy_sum += accuracy_scores[i]
print("\naverage accuracy: %.3f" % (accuracy_sum/k))
Note: the code defines attribute-selection functions for both criteria, the Gini index (CART, OptAttr_Gini) and information gain (ID3, OptAttr_Ent). As written above, TreeGenerate and PrePurn call OptAttr_Ent, i.e. the information-gain criterion; swap in OptAttr_Gini if you want the trees to be built with the Gini index instead.