%C4_5TrainFun.m
%Shiliang Sun (shiliangsun@gmail.com), Apr. 8, 2007
%Learning a decision tree by the C4.5 algorithm
%This code is based on the C4_5.m file from "Classification Toolbox for Matlab"
%(http://www.yom-tov.info/cgi-bin/list_uploaded_files.pl). Several bugs have been fixed.
function tree = C4_5TrainFun(patterns, targets, inc_node, discrete_dim)
%Build a tree recursively
% patterns - Training patterns, (number of features) * (number of samples)
% targets - Training targets, 1 * (number of samples)
% inc_node - Stop splitting a node that holds fewer than inc_node samples
% discrete_dim - 1 * (number of features); an entry is 0 for a continuous feature and nonzero for a discrete one
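% tree - The learned tree, a struct with fields:
%          dim       - index of the split attribute among the attributes
%                      remaining at this depth (0 at a leaf)
%          split_loc - threshold of a continuous split, inf otherwise
%          child     - class label(s) at a leaf, or an array of subtrees
%
%A minimal usage sketch (the data values below are illustrative only, not from the original):
% patterns = [5.1 4.9 6.3 5.8; 3.5 3.0 2.9 2.7]; %2 features, 4 samples
% targets = [1 1 2 2];
% tree = C4_5TrainFun(patterns, targets, 1, [0 0]); %both features continuous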
[Ni, L] = size(patterns);
Uc = unique(targets);
tree.dim = 0;
tree.split_loc = inf;
if isempty(patterns)
tree.child = []; %no samples reached this branch; set the field so all returned structs have identical fields
return;
end
%Stop splitting if there are fewer examples than inc_node, only one example, or only one class
if ((inc_node > L) || (L == 1) || (length(Uc) == 1))
%Label the leaf with the majority class
tempvec1=zeros(1,length(Uc));
for ii=1:length(Uc)
tempvec1(ii)=length(find(targets==Uc(ii)));
end
[m, largest] = max(tempvec1);
tree.child = Uc(largest);
return
end
%Compute the node's entropy
for i = 1:length(Uc),
Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log2(Pnode));
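%(Shannon entropy: Inode = -sum_i p_i*log2(p_i), with p_i the fraction of
%samples in class Uc(i))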
%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous patterns
delta_Ib = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;
for i = 1:Ni,
data = patterns(i,:);
Ud = unique(data);
Nbins = length(Ud);
if (discrete_dim(i)),
%This is a discrete pattern
P = zeros(length(Uc), Nbins);
for j = 1:length(Uc),
for k = 1:Nbins,
indices = find((targets == Uc(j)) & (patterns(i,:) == Ud(k)));
P(j,k) = length(indices);
end
end
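%Gain ratio = (information gain) / (split information): the gain
%Inode - sum_k Pk(k)*info(k) is normalized by the entropy of the split
%itself, -sum_k Pk(k)*log2(Pk(k)); eps guards against log2(0)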
Pk = sum(P); %number of samples in each bin
P = P./(eps+repmat(Pk,size(P,1),1)); %class distribution within each bin
Pk = Pk/sum(Pk); %fraction of samples falling in each bin
info = sum(-P.*log2(eps+P)); %class entropy within each bin
delta_Ib(i) = (Inode-sum(Pk.*info))/(-sum(Pk.*log2(eps+Pk)));
else
%This is a continuous pattern
P = zeros(length(Uc), 2);
%Sort the patterns
[sorted_data, indices] = sort(data);
sorted_targets = targets(indices);
%Evaluate each of the L-1 candidate binary splits between consecutive samples in sorted order
I = zeros(1, L-1);
for j = 1:L-1,
for k =1:length(Uc),
P(k,1) = length(find(sorted_targets(1:j) == Uc(k)));
P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
end
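%Gain ratio for the binary split {1..j} vs. {j+1..L}, computed as in the
%discrete case above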
Pk = sum(P);
P = P./(eps+repmat(Pk,size(P,1),1));
Pk = Pk/sum(Pk);
info = sum(-P.*log2(eps+P));
I(j) = (Inode-sum(Pk.*info))/(-sum(Pk.*log2(eps+Pk)));
end
[delta_Ib(i), s] = max(I);
split_loc(i) = sorted_data(s); %left child takes values <= threshold, right child takes values > threshold
end
end
%Find the dimension maximizing delta_Ib
[m, dim] = max(delta_Ib);
dims = 1:Ni;
tree.dim = dim;
dims(dim)=[]; %The remaining attributes
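%Note: the winning attribute is removed from further consideration even when
%it is continuous; Quinlan's C4.5 allows a continuous attribute to be re-split
%at deeper nodes, so this implementation simplifies that behavior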
%Split along the 'dim' dimension
Nf = unique(patterns(dim,:));
Nbins = length(Nf);
if (discrete_dim(dim)),
%Discrete pattern
for i = 1:Nbins,
indices = find(patterns(dim, :) == Nf(i));
if isempty(dims)
%No attributes remain: make this branch a leaf labeled with its majority class
tempvec1=zeros(1,length(Uc));
for ii=1:length(Uc)
tempvec1(ii)=length(find(targets(indices)==Uc(ii)));
end
[m, largest] = max(tempvec1);
tree.child(i) = Uc(largest);
else
tree.child(i) = C4_5TrainFun(patterns(dims, indices), targets(indices), inc_node, discrete_dim(dims));
end
end
else
%Continuous pattern
tree.split_loc = split_loc(dim);
indices1 = find(patterns(dim,:) <= split_loc(dim));
indices2 = find(patterns(dim,:) > split_loc(dim));
if isempty(dims)
%No attributes remain: label each side of the split with its majority class
tempvec1=zeros(1,length(Uc));
for ii=1:length(Uc)
tempvec1(ii)=length(find(targets(indices1)==Uc(ii)));
end
[m, largest] = max(tempvec1);
tree.child(1) = Uc(largest);
tempvec1=zeros(1,length(Uc));
for ii=1:length(Uc)
tempvec1(ii)=length(find(targets(indices2)==Uc(ii)));
end
[m, largest] = max(tempvec1);
tree.child(2) = Uc(largest);
else
tree.child(1) = C4_5TrainFun(patterns(dims, indices1), targets(indices1), inc_node, discrete_dim(dims));
tree.child(2) = C4_5TrainFun(patterns(dims, indices2), targets(indices2), inc_node, discrete_dim(dims));
end
end
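%Classification note: tree.dim indexes the attributes remaining at that depth
%(used attributes are removed on the way down), and the children of a discrete
%split are ordered by the unique values of the split attribute in the node's
%training subset, so a companion classifier must rebuild that ordering (e.g.
%from the training patterns) to follow the correct branch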