For the theory, see:
Decision Trees: Principles and Python Implementation
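As a quick recap of the quantities the code below computes (standard ID3 definitions): the entropy of a label set $D$, the conditional entropy after splitting on feature $A$, and the information gain are

$$H(D) = -\sum_k p_k \log p_k, \qquad H(D \mid A) = \sum_v \frac{|D_v|}{|D|}\, H(D_v), \qquad g(D, A) = H(D) - H(D \mid A).$$

ID3 splits on the feature with the largest gain $g(D, A)$; since $H(D)$ is the same for every candidate feature at a given node, this is equivalent to choosing the smallest conditional entropy $H(D \mid A)$, which is exactly what getBestFeature below does.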
Python implementation:
import collections
import numpy as np


class ID3:
    def __init__(self, x, y, labels):
        self.x = x
        self.y = y
        self.data = np.hstack((self.x, self.y))  # combined [x y] matrix
        self.labels = labels
        self.tree = {}

    def calEntropy(self, data):
        '''
        Compute the information entropy of a sequence of labels.
        :param data: list or other sequence
        :return: entropy of the input data
        '''
        entropy = 0
        c = collections.Counter(np.array(data).ravel())
        total = len(data)
        for i in c.values():
            # np.log is the natural log; the base only rescales the entropy,
            # so the best-feature comparison below is unaffected.
            entropy -= i / total * np.log(i / total)
        return entropy
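    # Hypothetical quick check: calEntropy([1, 1, 0])
    # = -(2/3)*ln(2/3) - (1/3)*ln(1/3) ≈ 0.6365 nats.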
    def splitdata(self, data, col, value):
        '''
        :param data: matrix of the form [x y]
        :param col: numeric index of the feature column to split on
        :param value: feature value to select
        :return: the rows of data whose col-th entry equals value, with column col removed
        '''
        data_r = data[data[:, col] == value]
        data_r = np.hstack((data_r[:, :col], data_r[:, col + 1:]))
        return data_r
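    # Hypothetical quick check: splitdata([[1, 1, 1], [1, 0, 0], [0, 1, 0]], 0, 1)
    # keeps the first two rows and drops column 0, giving [[1, 1], [0, 0]].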
    def getBestFeature(self, data):
        '''
        :param data: matrix of the form [x y]
        :return: column index of the best feature to split on
        '''
        entropy_list = []
        numberAll = data.shape[0]
        for col in range(data.shape[1] - 1):
            entropy_splited = 0
            for value in np.unique(data[:, col]):
                y_splited = self.splitdata(data, col, value)[:, -1]
                entropy = self.calEntropy(y_splited)
                entropy_splited += len(y_splited) / numberAll * entropy
            entropy_list.append(entropy_splited)
        # Minimal conditional entropy equals maximal information gain, since
        # the parent entropy H(D) is the same for every candidate feature.
        return entropy_list.index(min(entropy_list))
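    # Hand-computed sanity check: for the demo data in __main__ the conditional
    # entropies come out to roughly [0.382, 0.555] nats, so column 0 is selected.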
    def CreateTree(self, data, label):
        '''
        :param data: matrix of the form [x y]
        :param label: feature names for the columns of x
        :return: decision tree as a nested dict
        '''
        feature_label = label.copy()
        # All samples share one class: return it as a leaf.
        if len(np.unique(data[:, -1])) == 1:
            return data[0, -1]
        # No features left: return the majority class.
        if data.shape[1] == 1:
            return collections.Counter(data[:, -1]).most_common()[0][0]
        bestFeature = self.getBestFeature(data)
        bestFeatureLabel = feature_label[bestFeature]
        treeDict = {bestFeatureLabel: {}}
        del feature_label[bestFeature]
        for value in np.unique(data[:, bestFeature]):
            sub_labels = feature_label[:]
            splited_data = self.splitdata(data, bestFeature, value)
            treeDict[bestFeatureLabel][value] = self.CreateTree(splited_data, sub_labels)
        return treeDict
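    # Note: CreateTree recurses at most (number of features) levels deep, since
    # each call removes the chosen column from both the data and the label list.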
    def fit(self):
        self.tree = self.CreateTree(self.data, self.labels)
    def predict_vec(self, vec, input_tree=None):
        if input_tree is None:
            input_tree = self.tree
        featureIndex = self.labels.index(list(input_tree.keys())[0])
        secTree = list(input_tree.values())[0]
        vec_feature_val = vec[featureIndex]
        if type(secTree.get(vec_feature_val)) != dict:
            # Leaf reached; a feature value never seen in training yields None.
            return secTree.get(vec_feature_val)
        else:
            return self.predict_vec(vec, secTree.get(vec_feature_val))
    def predict(self, x):
        out_put = []
        for i in x:
            out_put.append(self.predict_vec(i))
        return out_put

if __name__ == '__main__':
    x = np.array([[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]])
    y = np.array([[1], [1], [0], [0], [0]])
    labels = [0, 1]
    id3 = ID3(x, y, labels)
    id3.fit()
    print(id3.predict([[1, 0]]))
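On this toy dataset the fitted tree should come out as {0: {0: 0, 1: {1: {0: 0, 1: 1}}}} (the keys are numpy integers): the root splits on feature 0, and the x0 == 1 branch splits on feature 1. predict([[1, 0]]) therefore descends feature 0 = 1, then feature 1 = 0, and the script prints [0].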
Python with scikit-learn:
import numpy as np
from sklearn import tree

x = np.array([[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]])
y = np.array([[1], [1], [0], [0], [0]])
# This is a classification task, so use DecisionTreeClassifier; the entropy
# criterion gives ID3-style splits (the library default is gini).
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
clf.fit(x, y.ravel())
print('Predicted value:', clf.predict([[1, 0]]))
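To see which splits the library actually learned, scikit-learn's export_text helper can print the fitted tree as indented text. A minimal sketch, assuming the clf fitted above; the feature names 'f0' and 'f1' are made up for display:

from sklearn.tree import export_text

# Render the fitted tree's split structure (available since scikit-learn 0.21).
print(export_text(clf, feature_names=['f0', 'f1']))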
C++ implementation:
#include <iostream>
#include <vector>
#include <map>
#include <set>
#include <algorithm>
#include <cmath>
#include <cfloat>
#include <cstdlib>
void printMat(std::vector<std::vector<float>> mat)
{
    for (size_t i = 0; i < mat.size(); i++)
    {
        for (size_t j = 0; j < mat[0].size(); j++)
        {
            std::cout << mat[i][j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
struct TreeNode
{
    float label;                      // internal node: feature label; leaf: predicted class
    float value;                      // feature value on the edge from the parent node
    std::vector<TreeNode*> children;
    bool isleaf = false;
};
class ID3
{
public:
    ID3(std::vector<std::vector<float>> x, std::vector<float> y, std::vector<float> labels)
    {
        m_x = x;
        m_y = y;
        m_labels = labels;
        m_data = x;
        for (size_t i = 0; i < x.size(); i++)
        {
            m_data[i].push_back(y[i]);  // append the class column: m_data is [x y]
        }
    }
    float calEntropy(std::vector<float> data)
    {
        float entropy = 0;
        std::map<float, int> c;
        for (auto i : data) ++c[i];
        float total = static_cast<float>(data.size());  // float: avoid integer division below
        for (std::map<float, int>::iterator it = c.begin(); it != c.end(); it++)
        {
            float p = it->second / total;
            entropy -= p * std::log(p);
        }
        return entropy;
    }
    std::vector<std::vector<float>> splitdata(std::vector<std::vector<float>> data, int col, float value)
    {
        // Select the rows whose col-th entry equals value, then drop column col.
        std::vector<int> index;
        for (size_t i = 0; i < data.size(); i++)
        {
            if (data[i][col] == value) index.push_back(i);
        }
        std::vector<std::vector<float>> data_r(index.size());
        for (size_t i = 0; i < index.size(); i++)
        {
            std::vector<float> vec;
            for (size_t j = 0; j < data[0].size(); j++)
            {
                if (j != static_cast<size_t>(col)) vec.push_back(data[index[i]][j]);
            }
            data_r[i] = vec;
        }
        return data_r;
    }
    int getBestFeature(std::vector<std::vector<float>> data)
    {
        std::vector<float> entropy_list;
        float numberAll = static_cast<float>(data.size());  // float: avoid integer division
        for (size_t col = 0; col < data[0].size() - 1; col++)
        {
            float entropy_splited = 0;
            std::set<float> s;
            for (size_t i = 0; i < data.size(); i++)
            {
                s.insert(data[i][col]);
            }
            for (auto value : s)
            {
                std::vector<std::vector<float>> splited = splitdata(data, col, value);
                std::vector<float> y_splited(splited.size());
                for (size_t i = 0; i < splited.size(); i++)
                {
                    y_splited[i] = splited[i][splited[0].size() - 1];
                }
                float entropy = calEntropy(y_splited);
                entropy_splited += y_splited.size() / numberAll * entropy;
            }
            entropy_list.push_back(entropy_splited);
        }
        // Minimal conditional entropy equals maximal information gain.
        int min_index = 0;
        float min_value = FLT_MAX;
        for (size_t i = 0; i < entropy_list.size(); i++)
        {
            if (entropy_list[i] < min_value)
            {
                min_value = entropy_list[i];  // track the running minimum
                min_index = static_cast<int>(i);
            }
        }
        return min_index;
    }
    TreeNode* CreateTree(std::vector<std::vector<float>> data, std::vector<float> label)
    {
        std::vector<float> feature_label = label;
        // Case 1: all samples share one class -> return a leaf with that class.
        std::set<float> s1;
        for (size_t i = 0; i < data.size(); i++)
        {
            s1.insert(data[i][data[0].size() - 1]);
        }
        if (s1.size() == 1)
        {
            TreeNode* node = new TreeNode;
            node->isleaf = true;
            node->label = data[0][data[0].size() - 1];
            return node;
        }
        // Case 2: no features left -> return a leaf with the majority class.
        if (data[0].size() == 1)
        {
            std::map<float, int> c;
            for (size_t i = 0; i < data.size(); i++)
            {
                ++c[data[i][data[i].size() - 1]];  // count the label of every row
            }
            std::vector<std::pair<float, int>> c_vec(c.begin(), c.end());
            std::sort(c_vec.begin(), c_vec.end(),
                [](std::pair<float, int> p1, std::pair<float, int> p2) { return p1.second < p2.second; });
            TreeNode* node = new TreeNode;
            node->isleaf = true;
            node->label = c_vec.rbegin()->first;
            return node;
        }
        // Otherwise: split on the best feature and recurse on each of its values.
        int bestFeature = getBestFeature(data);
        float bestFeatureLabel = feature_label[bestFeature];
        TreeNode* node = new TreeNode;
        node->label = bestFeatureLabel;
        feature_label.erase(feature_label.begin() + bestFeature);
        std::set<float> s2;
        for (size_t i = 0; i < data.size(); i++)
        {
            s2.insert(data[i][bestFeature]);  // index by column, not by label value
        }
        for (auto value : s2)
        {
            std::vector<float> sub_labels = feature_label;
            std::vector<std::vector<float>> splited_data = splitdata(data, bestFeature, value);
            TreeNode* childnode = CreateTree(splited_data, sub_labels);
            childnode->value = value;
            node->children.push_back(childnode);
        }
        return node;
    }
    void fit()
    {
        m_tree = CreateTree(m_data, m_labels);
    }
    TreeNode* predict_vec(std::vector<float> vec, TreeNode* input_tree = nullptr)
    {
        if (input_tree == nullptr)
            input_tree = m_tree;
        // Map the node's feature label back to its index in the input vector.
        int featureIndex = 0;
        for (size_t i = 0; i < m_labels.size(); i++)
        {
            if (m_labels[i] == input_tree->label)
            {
                featureIndex = i;
                break;
            }
        }
        std::vector<TreeNode*> secTree = input_tree->children;
        float vec_feature_val = vec[featureIndex];
        bool isdict = false;  // initialized: stays false when no child matches
        size_t i = 0;
        for (; i < secTree.size(); i++)
        {
            if (secTree[i]->value == vec_feature_val)
            {
                isdict = !(secTree[i]->isleaf);
                break;
            }
        }
        if (i == secTree.size())
            return nullptr;  // feature value never seen during training
        if (!isdict)
            return secTree[i];
        else
            return predict_vec(vec, secTree[i]);
    }
    std::vector<float> predict(std::vector<std::vector<float>> x)
    {
        std::vector<float> out_put;
        for (auto i : x)
        {
            TreeNode* leaf = predict_vec(i);
            // -1 is an arbitrary sentinel for "no matching branch".
            out_put.push_back(leaf ? leaf->label : -1.0f);
        }
        return out_put;
    }
private:
    std::vector<std::vector<float>> m_x;
    std::vector<float> m_y;
    std::vector<std::vector<float>> m_data;
    std::vector<float> m_labels;

public:
    TreeNode* m_tree = nullptr;
};
int main(int argc, char* argv[])
{
    std::vector<std::vector<float>> x = { { 1, 1 },{ 1, 1 },{ 1, 0 },{ 0, 1 },{ 0, 1 } };
    std::vector<float> y = { 1, 1, 0, 0, 0 };
    std::vector<float> labels = { 0, 1 };
    ID3 id3 = ID3(x, y, labels);
    id3.fit();
    std::cout << "Predicted value: " << id3.predict({ { 1, 0 } })[0] << std::endl;
    system("pause");
    return EXIT_SUCCESS;
}