function test_targets = C4_5(train_patterns, train_targets, test_patterns, inc_node, Nu)
% Classify using Quinlan's C4.5 algorithm
% Inputs:
%   train_patterns - Train patterns (one sample per column)
%   train_targets  - Train targets
%   test_patterns  - Test patterns
%   inc_node       - Percentage of incorrectly assigned samples at a node
%                    (used as a pre-pruning stop threshold)
%   Nu             - Maximum number of unique values for a dimension
%                    to be treated as discrete
% Outputs:
%   test_targets   - Predicted targets
%
%NOTE: In this implementation it is assumed that a pattern vector with no more
%than Nu (typically 10) unique values is discrete, and will be treated as such.
%Other vectors will be treated as continuous.
[Ni, M] = size(train_patterns);
inc_node = inc_node*M/100;
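%inc_node is now an absolute sample count: e.g. inc_node = 5 with M = 200
%training samples yields 10, so nodes with fewer than 10 samples become leaves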
%Find which of the input patterns are discrete, and discretize the
%corresponding dimension on the test patterns
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
    Ub = unique(train_patterns(i,:));   %Distinct values appearing in row i
    Nb = length(Ub);
    if (Nb <= Nu),
        %This is a discrete pattern
        discrete_dim(i) = Nb;
        dist = abs(ones(Nb,1)*test_patterns(i,:) - Ub'*ones(1, size(test_patterns,2)));
        [m, in] = min(dist);
        test_patterns(i,:) = Ub(in);   %Snap each test value to the nearest training value
    end
end
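%Example: if row i of the training data only takes the values [0 1 2], a test
%value of 1.4 in that dimension is snapped to the nearest training value, 1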
%Build the tree recursively
%disp('Building tree')
tree = make_tree(train_patterns, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

%Classify the test samples
%disp('Classify test samples using the tree')
test_targets = use_tree(test_patterns, 1:size(test_patterns,2), tree, discrete_dim, unique(train_targets));
%END C4_5
function targets = use_tree(patterns, indices, tree, discrete_dim, Uc)
%Classify recursively using a tree
targets = zeros(1, size(patterns,2));

if (tree.dim == 0)
    %Reached a leaf of the tree: assign its class label
    targets(indices) = tree.child;
    return
end

%This is not the last level of the tree, so:
%First, find the dimension we are to work on
dim  = tree.dim;
dims = 1:size(patterns,1);

%And classify according to it. Each recursive call fills in only its own
%(disjoint) index set, so summing the results labels every sample exactly once.
if (discrete_dim(dim) == 0),
    %Continuous pattern
    in = indices(find(patterns(dim, indices) <= tree.split_loc));
    targets = targets + use_tree(patterns(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
    in = indices(find(patterns(dim, indices) > tree.split_loc));
    targets = targets + use_tree(patterns(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
    %Discrete pattern
    Uf = unique(patterns(dim,:));
    for i = 1:length(Uf),
        if any(Uf(i) == tree.Nf)   %Only values seen during training have a branch; otherwise do nothing
            in = indices(find(patterns(dim, indices) == Uf(i)));
            targets = targets + use_tree(patterns(dims, :), in, tree.child(find(Uf(i)==tree.Nf)), discrete_dim(dims), Uc);
        end
    end
end
%END use_tree
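%Structure of the tree built by make_tree (below), as used above:
%  dim       - index of the splitting dimension (0 marks a leaf)
%  Nf        - attribute values observed at the node (used for discrete splits)
%  split_loc - threshold for continuous splits ([] or inf otherwise)
%  child     - struct array of subtrees, or the class label at a leaf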
function tree = make_tree(patterns, targets, inc_node, discrete_dim, maxNbin, base)
%Build a tree recursively
[Ni, L] = size(patterns);
Uc = unique(targets);

tree.dim = 0;
%tree.child(1:maxNbin) = zeros(1,maxNbin);
tree.split_loc = inf;

if isempty(patterns),
    return
end

%When to stop: if there are too few examples, or only one class remains
if ((inc_node > L) | (L == 1) | (length(Uc) == 1)),
    H = hist(targets, length(Uc));
    [m, largest] = max(H);
    tree.Nf = [];
    tree.split_loc = [];
    tree.child = Uc(largest);   %Leaf: the majority class
    return
end

%Compute the node's entropy I
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log(Pnode)/log(2));
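%Example: with two equally likely classes, Pnode = [0.5 0.5] and
%Inode = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit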
%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous patterns
delta_Ib  = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;

for i = 1:Ni,
    data  = patterns(i,:);
    Ud    = unique(data);
    Nbins = length(Ud);
    if (discrete_dim(i)),
        %This is a discrete pattern
        P = zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices = find((targets == Uc(j)) & (patterns(i,:) == Ud(k)));
                P(j,k) = length(indices);
            end
        end
        Pk   = sum(P);
        P    = P/L;
        Pk   = Pk/sum(Pk);
        info = sum(-P.*log(eps+P)/log(2));
        delta_Ib(i) = (Inode - sum(Pk.*info)) / (-sum(Pk.*log(eps+Pk)/log(2)));
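        %delta_Ib(i) is C4.5's gain ratio: the information gain of the split
        %divided by the split information -sum(Pk.*log2(Pk)), which penalizes
        %attributes that fan out into many distinct values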
    else
        %This is a continuous pattern
        P = zeros(length(Uc), 2);
        %Sort the patterns along this dimension
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);

        %Calculate the information for each possible split point
        I = zeros(1, L-1);
        for j = 1:L-1,
            P(:, 1) = hist(sorted_targets(1:j),     Uc);
            P(:, 2) = hist(sorted_targets(j+1:end), Uc);
            Ps   = sum(P)/L;
            P    = P/L;
            Pk   = sum(P);
            P1   = repmat(Pk, length(Uc), 1);
            P1   = P1 + eps*(P1==0);
            info = sum(-P.*log(eps+P./P1)/log(2));
            I(j) = Inode - sum(info.*Ps);
        end
        [delta_Ib(i), s] = max(I);
        split_loc(i) = sorted_data(s);
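        %Note: the threshold is taken at sorted_data(s) itself; Quinlan's C4.5
        %is often described as splitting midway between sorted_data(s) and
        %sorted_data(s+1), so results can differ slightly from other codes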
    end
end
%Find the dimension maximizing delta_Ib
[m, dim] = max(delta_Ib);
dims = 1:Ni;
tree.dim = dim;

%Split along the 'dim' dimension
Nf    = unique(patterns(dim,:));
Nbins = length(Nf);
tree.Nf = Nf;
tree.split_loc = split_loc(dim);

%If only one value remains for this pattern, one cannot split it
if (Nbins == 1)
    H = hist(targets, length(Uc));
    [m, largest] = max(H);
    tree.Nf = [];
    tree.split_loc = [];
    tree.child = Uc(largest);
    return
end

if (discrete_dim(dim)),
    %Discrete pattern: one child per observed value
    for i = 1:Nbins,
        indices = find(patterns(dim, :) == Nf(i));
        tree.child(i) = make_tree(patterns(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin, base);
    end
else
    %Continuous pattern: binary split at split_loc(dim)
    indices1 = find(patterns(dim,:) <= split_loc(dim));
    indices2 = find(patterns(dim,:) > split_loc(dim));
    if ~(isempty(indices1) | isempty(indices2))
        tree.child(1) = make_tree(patterns(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin, base+1);
        tree.child(2) = make_tree(patterns(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin, base+1);
    else
        %Degenerate split: fall back to a leaf with the majority class
        H = hist(targets, length(Uc));
        [m, largest] = max(H);
        tree.child = Uc(largest);
        tree.dim = 0;
    end
end
%END make_tree
This is the program for the problem. Could someone give an example of how to call it? Urgent!
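A minimal calling sketch, assuming the code above is saved as C4_5.m on the MATLAB path. The data, labels, and parameter values below are made up purely for illustration; patterns are stored one sample per column, and inc_node is a percentage of the training-set size:

rand('state', 0);                              %Reproducible toy data (illustrative only)
train_patterns = [rand(2,50), rand(2,50)+1];   %2 features x 100 samples, two clusters
train_targets  = [zeros(1,50), ones(1,50)];    %Class labels 0 and 1
test_patterns  = [rand(2,20), rand(2,20)+1];
true_targets   = [zeros(1,20), ones(1,20)];

inc_node = 5;     %Stop splitting nodes holding under 5% of the training samples
Nu       = 10;    %Dimensions with at most 10 unique values are treated as discrete

test_targets = C4_5(train_patterns, train_targets, test_patterns, inc_node, Nu);
accuracy = mean(test_targets == true_targets)  %Fraction of correctly classified test samples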
The following is another C4.5 decision-tree program. I can't fully make sense of it either; if you can, let's compare notes.

function D = C4_5(train_features, train_targets, inc_node, region)
% Classify using Quinlan's C4.5 algorithm
% Inputs:
%   train_features - Train features
%   train_targets  - Train targets
%   inc_node       - Percentage of incorrectly assigned samples at a node
%   region         - Decision region vector: [-x x -y y number_of_points]
%
% Outputs:
%   D              - Decision surface
%
%NOTE: In this implementation it is assumed that a feature vector with fewer
%than 10 unique values (the parameter Nu) is discrete, and will be treated
%as such. Other vectors will be treated as continuous.

[Ni, M] = size(train_features);
inc_node = inc_node*M/100;
Nu = 10;
%For the decision region
N  = region(5);
mx = ones(N,1) * linspace(region(1),region(2),N);
my = linspace(region(3),region(4),N)' * ones(1,N);
flatxy = [mx(:), my(:)]';

%Preprocessing
%[f, t, UW, m] = PCA(train_features, train_targets, Ni, region)
%train_features = UW * (train_features - m*ones(1,M))
%flatxy = UW * (flatxy - m*ones(1,N^2))
%Find which of the input features are discrete, and discretize the
%corresponding dimension on the decision region
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
    Nb = length(unique(train_features(i,:)));
    if (Nb <= Nu),
        %This is a discrete feature
        discrete_dim(i) = Nb;
        [H, flatxy(i,:)] = high_histogram(flatxy(i,:), Nb);   %high_histogram is a helper from the same toolbox
    end
end
%Build the tree recursively
disp('Building tree')
tree = make_tree(train_features, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

%Make the decision region according to the tree
disp('Building decision surface using the tree')
targets = use_tree(flatxy, 1:N^2, tree, discrete_dim, unique(train_targets));
D = reshape(targets,N,N);
%END
function targets = use_tree(features, indices, tree, discrete_dim, Uc)
%Classify recursively using a tree
targets = zeros(1, size(features,2));

if (tree.dim == 0)
    %Reached the end of the tree
    targets(indices) = tree.child;
    return
end

%This is not the last level of the tree, so:
%First, find the dimension we are to work on
dim  = tree.dim;
dims = 1:size(features,1);

%And classify according to it
if (discrete_dim(dim) == 0),
    %Continuous feature
    in = indices(find(features(dim, indices) <= tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
    in = indices(find(features(dim, indices) > tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
    %Discrete feature
    Uf = unique(features(dim,:));
    for i = 1:length(Uf),
        in = indices(find(features(dim, indices) == Uf(i)));
        targets = targets + use_tree(features(dims, :), in, tree.child(i), discrete_dim(dims), Uc);
    end
end
%END use_tree
function tree = make_tree(features, targets, inc_node, discrete_dim, maxNbin, base)
%Build a tree recursively
[Ni, L] = size(features);
Uc = unique(targets);

tree.dim = 0;
%tree.child(1:maxNbin) = zeros(1,maxNbin);
tree.split_loc = inf;

if isempty(features),
    return
end

%When to stop: if there are too few examples, or only one class remains
if ((inc_node > L) | (L == 1) | (length(Uc) == 1)),
    H = hist(targets, length(Uc));
    [m, largest] = max(H);
    tree.child = Uc(largest);
    return
end

%Compute the node's entropy I
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log(Pnode)/log(2));

%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous features
delta_Ib  = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;
for i = 1:Ni,
    data  = features(i,:);
    Nbins = length(unique(data));
    if (discrete_dim(i)),
        %This is a discrete feature
        %(note: this variant assumes discrete features are coded as integers 1..Nbins)
        P = zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices = find((targets == Uc(j)) & (features(i,:) == k));
                P(j,k) = length(indices);
            end
        end
        Pk   = sum(P);
        P    = P/L;
        Pk   = Pk/sum(Pk);
        info = sum(-P.*log(eps+P)/log(2));
        delta_Ib(i) = (Inode - sum(Pk.*info)) / (-sum(Pk.*log(eps+Pk)/log(2)));
    else
        %This is a continuous feature
        P = zeros(length(Uc), 2);
        %Sort the features
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);

        %Calculate the information for each possible split
        I = zeros(1, L-1);
        for j = 1:L-1,
            for k = 1:length(Uc),
                P(k,1) = length(find(sorted_targets(1:j)     == Uc(k)));
                P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
            end
            Ps   = sum(P)/L;
            P    = P/L;
            info = sum(-P.*log(eps+P)/log(2));
            I(j) = Inode - sum(info.*Ps);
        end
        [delta_Ib(i), s] = max(I);
        split_loc(i) = sorted_data(s);
    end
end
%Find the dimension maximizing delta_Ib
[m, dim] = max(delta_Ib);
dims = 1:Ni;
tree.dim = dim;

%Split along the 'dim' dimension
Nf = unique(features(dim,:));
Nbins = length(Nf);
if (discrete_dim(dim)),
    %Discrete feature
    for i = 1:Nbins,
        indices = find(features(dim, :) == Nf(i));
        tree.child(i) = make_tree(features(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin, base);
    end
else
    %Continuous feature
    tree.split_loc = split_loc(dim);
    indices1 = find(features(dim,:) <= split_loc(dim));
    indices2 = find(features(dim,:) > split_loc(dim));
    tree.child(1) = make_tree(features(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin, base+1);   %base+1 added: make_tree expects six arguments
    tree.child(2) = make_tree(features(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin, base+1);
end
%END make_tree
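A hedged calling sketch for this second variant, which returns a decision surface over a 2-D grid instead of predicted labels. It assumes the code is saved under a file name of its own (here the made-up C4_5_region.m, to avoid clashing with the version above) and that the helper high_histogram from the same toolbox is on the path. All data and values are illustrative:

train_features = [randn(2,40)-1, randn(2,40)+1];  %2 x 80 synthetic samples, two classes
train_targets  = [zeros(1,40), ones(1,40)];
region   = [-4 4 -4 4 100];   %[x_min x_max y_min y_max grid_points]
inc_node = 5;

D = C4_5_region(train_features, train_targets, inc_node, region);   %hypothetical renamed file
imagesc(D); axis xy           %Visualize the 100x100 decision surface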