⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 c4_5trainfun.m

📁 c4.5 关于决策树decision tree的matlab实现程序
💻 M
字号:
%C4_5TrainFun.m
%Shiliang Sun (shiliangsun@gmail.com), Apr. 8, 2007
%Learning a decision tree by the C4.5 algorithm
%This code is based on the C4_5.m file from "Classification Toolbox for Matlab"
%(http://www.yom-tov.info/cgi-bin/list_uploaded_files.pl). Several bugs have been fixed.


function tree = C4_5TrainFun(patterns, targets, inc_node, discrete_dim)
%Build a tree recursively
%  patterns     - Train patterns, (the number of features) * (the number of samples)
%  targets	    - Train targets, 1*(the number of samples)
%  inc_node     - Stop building a subtree if its total samples are less than inc_node
%  discrete_dim - 1*(the number of features). 0 entries for continuous features 

[Ni, L]    					= size(patterns);
Uc         					= unique(targets);
tree.dim					= 0;
tree.split_loc				= inf;

if isempty(patterns)
    return;
end

%When to stop: If the number of examples is smaller than inc_node, or is 1, or the category is unique
if ((inc_node > L) | (L == 1) | (length(Uc) == 1))
    tempvec1=zeros(1,length(Uc));
    for ii=1:length(Uc)
        tempvec1(ii)=length(find(targets==Uc(ii)));
    end
    [m, largest] = max(tempvec1);
    tree.child	 = Uc(largest);
    return
end

%Compute the node's entropy
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log(Pnode)/log(2));

%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous patterns
delta_Ib    = zeros(1, Ni);
split_loc	= ones(1, Ni)*inf;

for i = 1:Ni,
    data	= patterns(i,:);
    Ud      = unique(data);
    Nbins	= length(Ud);
    if (discrete_dim(i)),
        %This is a discrete pattern
        P	= zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices 	= find((targets == Uc(j)) & (patterns(i,:) == Ud(k)));
                P(j,k) 	= length(indices);
            end
        end
        Pk          = sum(P);
        P           = P./(eps+repmat(Pk,size(P,1),1)); 
        Pk          = Pk/sum(Pk);
        info        = sum(-P.*log(eps+P)/log(2));
        delta_Ib(i) = (Inode-sum(Pk.*info))/-sum(Pk.*log(eps+Pk)/log(2));
    else
        %This is a continuous pattern
        P	= zeros(length(Uc), 2);

        %Sort the patterns
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);
        
        %Calculate the information for each possible split
        I	= zeros(1, L-1);
        for j = 1:L-1,
            for k =1:length(Uc),
                P(k,1) = length(find(sorted_targets(1:j) 	 == Uc(k)));
                P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
            end
            Pk      = sum(P); 
            P       = P./(eps+repmat(Pk,size(P,1),1)); 
            Pk      = Pk/sum(Pk);
            info	= sum(-P.*log(eps+P)/log(2)); 
            I(j)	= (Inode-sum(Pk.*info))/-sum(Pk.*log(eps+Pk)/log(2)); 
        end
        [delta_Ib(i), s] = max(I);
        split_loc(i) = sorted_data(s); %<=threshold vs. >threshold
    end
end

%Find the dimension maximizing delta_Ib
[m, dim]    = max(delta_Ib);
dims        = 1:Ni;
tree.dim    = dim;

dims(dim)=[]; %The remaining attributes

%Split along the 'dim' dimension
Nf		= unique(patterns(dim,:));
Nbins	= length(Nf);
if (discrete_dim(dim)),
    %Discrete pattern
    for i = 1:Nbins,
        indices    	= find(patterns(dim, :) == Nf(i));
        if isempty(dims)
            tempvec1=zeros(1,length(Uc));
            for ii=1:length(Uc)
                tempvec1(ii)=length(find(targets(indices)==Uc(ii)));
            end
            [m, largest] 	= max(tempvec1);
            tree.child(i) 	= Uc(largest);
        else
            tree.child(i)	= C4_5TrainFun(patterns(dims, indices), targets(indices), inc_node, discrete_dim(dims));
        end
    end
else
    %Continuous pattern
    tree.split_loc	= split_loc(dim);
    indices1		   	= find(patterns(dim,:) <= split_loc(dim));
    indices2	   		= find(patterns(dim,:) > split_loc(dim));
    if isempty(dims)
        tempvec1=zeros(1,length(Uc));
        for ii=1:length(Uc)
            tempvec1(ii)=length(find(targets(indices1)==Uc(ii)));
        end
        [m, largest] 	= max(tempvec1);
        tree.child(1) 	= Uc(largest);
        tempvec1=zeros(1,length(Uc));
        for ii=1:length(Uc)
            tempvec1(ii)=length(find(targets(indices2)==Uc(ii)));
        end
        [m, largest] 	= max(tempvec1);
        tree.child(2) 	= Uc(largest);
    else
        tree.child(1)	= C4_5TrainFun(patterns(dims, indices1), targets(indices1), inc_node, discrete_dim(dims));
        tree.child(2)	= C4_5TrainFun(patterns(dims, indices2), targets(indices2), inc_node, discrete_dim(dims));
    end
end

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -