📄 learn_params.m

📁 贝叶斯算法(matlab编写) 安装,添加目录 /home/ai2/murphyk/matlab/FullBNT
💻 M
📖 第 1 页 / 共 2 页
字号:
上一页 12
best_attr=0;  %the attribute with the max_gainbest_split = []; %the split of T according to the value of best_attrcur_best_threshhold = 0; %the threshhold for split continuous attributebest_threshhold=0;% compute Info(T) (for discrete output)if (output_type == 0)  class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class  info_T = compute_info (fam_ev, T, class_split_T);else % compute R(T) (for cts output)%  N = size(fam_ev,2);%  cases_T = fam_ev(size(fam_ev,1),T); %get the output value for cases T%  std_T = std(cases_T);%  avg_y_T = mean(cases_T);  sqr_T = cases_T - avg_y_T;  R_T = sum(sqr_T.*sqr_T)/N;  % get R(T) = 1/N * SUM(y-avg_y)^2  info_T = R_T;endfor i=1:(size_fam-1)  if (myismember(i,candidate_attrs))  %if this attribute still in the candidate attribute set    if (node_types(i)==0) %discrete attibute      split_T = split_cases(fam_ev,node_sizes,node_types,T,i,0); %split cases according to value of attribute i      % For cts output, we compute the least square gain.      % For discrete output, we compute gain ratio      cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain ratio    else %cts attribute      %get the values of this attribute      ev = fam_ev(:,T);      values = ev(i,:);      sort_v = sort(values);         %remove the duplicate values in sort_v      v_set = unique(sort_v);        best_gain = 0;      best_threshhold = 0;      best_split1 = [];            %find the best split for this cts attribute      % see "Quilan 96: Improved Use of Continuous Attributes in C4.5"      for j=1:(size(v_set,2)-1)        mid_v = (v_set(j)+v_set(j+1))/2;         split_T = split_cases(fam_ev,node_sizes,node_types,T,i,mid_v); %split cases according to value of attribute i (<=mid_v)        % For cts output, we compute the least square gain.        % For discrete output, we use Quilan 96: use information gain instead of gain ratio to select threshhold        cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,1,output_type);         %if (i==6)        %  fprintf('gain %8.5f threshhold %6.3f spliting %d\n', cur_gain, mid_v, size(split_T{1},2));        %end        if (best_gain < cur_gain)          best_gain = cur_gain;          best_threshhold = mid_v;          %best_split1 = split_T;     %here we need to copy array, not good!!! (maybe we can compute after we get best_attr        end      end      %recalculate the gain_ratio of the best_threshhold      split_T = split_cases(fam_ev,node_sizes,node_types,T,i,best_threshhold);      best_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain_ratio      if (output_type==0) %for discrete output        cur_gain = best_gain-log2(size(v_set,2)-1)/size_t; % Quilan 96: use the gain_ratio-log2(N-1)/|D| as the gain of this attr      else                %for cts output        cur_gain = best_gain;      end    end        if (max_gain < cur_gain)      max_gain = cur_gain;      best_attr = i;      cur_best_threshhold=best_threshhold;  %save the threshhold      %best_split = split_T;        %here we need to copy array, not good!!! So we will recalculate in below line 313    end  endend% stop splitting if gain is too smallif (max_gain==0 | (output_type==0 & max_gain < min_gain) | (output_type==1 & max_gain < cts_min_gain))   if (output_type==0)    tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of class node     tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t;      tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;    fprintf('Create leaf node(nogain) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );  else    fprintf('Create leaf node(nogain) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);  end  return;end%get the split of cases according to the best split attributeif (node_types(best_attr)==0) %discrete attibute  best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,0);  else    best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,cur_best_threshhold);end  %(4) best_attr = AttributeWithBestGain;%(5) if best_attr is continuous             ???? why need this? maybe the value in the decision tree must appeared in data%       find threshhold in all cases that <= max_V%    change the split of Ttree.nodes(tree.num_node).split_id=best_attr;tree.nodes(tree.num_node).split_threshhold=cur_best_threshhold; %for cts attribute only%note: below threshhold rejust is linera search, so it is slow. A better method is described in paper "Efficient C4.5"%if (output_type==0)if (node_types(best_attr)==1)  %is a continuous attribute  %find the value that approximate best_threshhold from below (the largest that <= best_threshhold)  best_value=0;  for i=1:size(fam_ev,2)  %note: need to search in all cases for all tree, not just in cases for this node    val = fam_ev(best_attr,i);    if (val <= cur_best_threshhold & val > best_value) %val is more clear to best_threshhold      best_value=val;    end  end  tree.nodes(tree.num_node).split_threshhold=best_value; %for cts attribute onlyend%end  if (output_type == 0)  fprintf('Create node %d split at %d gain %8.4f Th %d. Class %d Cases %d Error %d \n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, top1_class, size_t, size_t - top1_class_cases );else  fprintf('Create node %d split at %d gain %8.4f Th %d. Mean %8.4f Cases %d\n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, avg_y_T, size_t );end  %(6) Foreach T' in the split_T%        if T' is Empty%            Child of node_id is a leaf%        else%            Child of node_id = split_tree (T')tree.nodes(new_node).is_leaf=0; %because this node will be split, it is not leaf nowfor i=1:size(best_split,2)  if (size(best_split{i},2)==0) %T(i) is empty    %create one new leaf node    tree.num_node=tree.num_node+1;    tree.nodes(tree.num_node).used=1; %flag this node is used (0 means node not used, it will be removed from tree at last to save memory)    tree.nodes(tree.num_node).is_leaf=1;    tree.nodes(tree.num_node).children=[];    tree.nodes(tree.num_node).split_id=0;    tree.nodes(tree.num_node).split_threshhold=0;      if (output_type == 0)      tree.nodes(tree.num_node).probs=zeros(1,num_cat); %the prob for each value of class node       tree.nodes(tree.num_node).probs(top1_class)=1; %use the majority class of parent node, like for binary class,                                                    %and majority is class 2, then the CPT is [0 1]                                                   %we may need to use prior to do smoothing, to get [0.001 0.999]      tree.nodes(tree.num_node).error.self_error=0;       tree.nodes(tree.num_node).error.all_error=0;        tree.nodes(tree.num_node).error.all_error_num=0;    else      tree.nodes(tree.num_node).mean = avg_y_T; %just use parent node's mean value      tree.nodes(tree.num_node).std = std_T;    end    %add the new leaf node to parents    num_children=size(tree.nodes(new_node).children,2);    tree.nodes(new_node).children(num_children+1)=tree.num_node;    if (output_type==0)      fprintf('Create leaf node(nullset) %d. %d-th child of Father %d Class %d\n',tree.num_node, i, new_node, top1_class );    else      fprintf('Create leaf node(nullset) %d. %d-th child of Father %d \n',tree.num_node, i, new_node );    end  else    if (node_types(best_attr)==0)  % if attr is discrete, it should be removed from the candidate set        new_candidate_attrs = mysetdiff(candidate_attrs,[best_attr]);    else      new_candidate_attrs = candidate_attrs;    end    new_sub_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, best_split{i}, new_candidate_attrs, num_cat);      %tree.nodes(parent_id).error.all_error += tree.nodes(new_sub_node).error.all_error;    fprintf('Add subtree node %d to %d. #nodes %d\n',new_sub_node,new_node, tree.num_node );%   tree.nodes(new_node).error.all_error_num = tree.nodes(new_node).error.all_error_num + tree.nodes(new_sub_node).error.all_error_num;    %add the new leaf node to parents    num_children=size(tree.nodes(new_node).children,2);    tree.nodes(new_node).children(num_children+1)=new_sub_node;  endend     %(7) Compute errors of N; for doing pruning%    get the total error for the subtreeif (output_type==0)  tree.nodes(new_node).error.all_error=tree.nodes(new_node).error.all_error_num/size_t;end%doing pruning, but doing here is not so efficient, because it is bottom up.%if tree.nodes()%after doing pruning, need to update the all_error to self_error%(8) Return N  %(1) For discrete output, we use GainRatio defined as below%  			         Gain(X,T)% 	GainRatio(X,T) = ----------% 			         SplitInfo(X,T)%   where%   Gain(X,T) = Info(T) - Info(X,T)%    				                       |Ti|% 	Info(X,T) = Sum for i from 1 to n of ( ---- * Info(Ti))%                                          |T| 			 %   SplitInfo(D,T) is the information due to the split of T on the basis%    of the value of the categorical attribute D. Thus SplitInfo(D,T) is%  		 I(|T1|/|T|, |T2|/|T|, .., |Tm|/|T|)%    where {T1, T2, .. Tm} is the partition of T induced by the value of D.%   Definition of Info(Ti)%     If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the %     value of the categorical attribute, then the information needed to identify the class of an element of T %     is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck): %     	P = (|C1|/|T|, |C2|/|T|, ..., |Ck|/|T|)%     Here I(P) is defined as%       I(P) = -(p1*log(p1) + p2*log(p2) + .. + pn*log(pn))% %(2) For continuous output (regression tree), we use least squares score (adapted from Leo Breiman's book "Classification and regression trees", page 231%    The original support only binary split, we further extend it to permit multiple-child split%                                        %     Delta_R = R(T) - Sum for all childe nodes Ti (R(Ti))%     Where R(Ti)= 1/N * Sum for all cases i in node Ti ((yi - avg_y(Ti))^2)%     here N is the number of all training cases for construct the regression tree%          avg_y(Ti) is the average value for output variable for the cases in node Tifunction gain_score = compute_gain (fam_ev, node_sizes, node_types, T, info_T, attr_id, split_T, score_type, output_type)% COMPUTE_GAIN Compute the score for the split of cases T using attribute attr_id% gain_score = compute_gain (fam_ev, T, attr_id, node_size, method)%% fam_ev(i,j)  is the value of attribute i in j-th training cases, the last row is for the class label (self_ev)% T(i) is the index of i-th cases in current decision tree node, we need split it further% attr_id is the index of current node considered for a split% split_T{i} is the i_th subset in partition of cases T according to the value of attribute attr_id% score_type if 0, is gain ratio, 1 is information gain (only apply to discrete output)% node_size(i) the node size of i-th node in the family% output_type: 0 means discrete output, 1 means continuous output.gain_score=0;% ***********for DISCRETE output*******************************************************if (output_type == 0)  % compute Info(T)  total_cnt = size(T,2);  if (total_cnt==0)    return;  end;  %class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class  %info_T = compute_info (fam_ev, T, class_split_T);  % compute Info(X,T)  num_class = size(split_T,2);   subset_sizes = zeros(1,num_class);  info_ti = zeros(1,num_class);  for i=1:num_class    subset_sizes(i)=size(split_T{i},2);    if (subset_sizes(i)~=0)      class_split_Ti = split_cases(fam_ev,node_sizes,node_types,split_T{i},size(fam_ev,1),0); %split cases according to class      info_ti(i) = compute_info(fam_ev, split_T{i}, class_split_Ti);    end  end      ti_ratios = subset_sizes/total_cnt;  %get the |Ti|/|T|  info_X_T = sum(ti_ratios.*info_ti);  %get Gain(X,T)  gain_X_T = info_T - info_X_T;  if (score_type == 1) %information gain    gain_score=gain_X_T;    return;  end  %compute the SplitInfo(X,T)   //is this also for cts attr, only split into two subsets  splitinfo_T = compute_info (fam_ev, T, split_T);  if (splitinfo_T~=0)    gain_score = gain_X_T/splitinfo_T;  end% ************for continuous output**************************************************else   N = size(fam_ev,2);  % compute R(Ti)  num_class = size(split_T,2);   R_Ti = zeros(1,num_class);  for i=1:num_class    if (size(split_T{i},2)~=0)      cases_T = fam_ev(size(fam_ev,1),split_T{i});      avg_y_T = mean(cases_T);      sqr_T = cases_T - avg_y_T;      R_Ti(i) = sum(sqr_T.*sqr_T)/N;  % get R(Ti) = 1/N * SUM(y-avg_y)^2    end  end  %delta_R = R(T) - SUM(R(Ti))  gain_score = info_T - sum(R_Ti);end%   Definition of Info(Ti)%     If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the %     value of the categorical attribute, then the information needed to identify the class of an element of T %     is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck): %     	P = (|C1|/|T|, |C2|/|T|, ..., |Ck|/|T|)%     Here I(P) is defined as%       I(P) = -(p1*log(p1) + p2*log(p2) + .. + pn*log(pn))function info = compute_info (fam_ev, T, split_T)% COMPUTE_INFO compute the information for the split of T into split_T% info = compute_info (fam_ev, T, split_T)total_cnt = size(T,2);num_class = size(split_T,2);subset_sizes = zeros(1,num_class);probs = zeros(1,num_class);log_probs = zeros(1,num_class);for i=1:num_class  subset_sizes(i)=size(split_T{i},2);end    probs = subset_sizes/total_cnt;%log_probs = log2(probs);  % if probs(i)=0, the log2(probs(i)) will be Inffor i=1:size(probs,2)  if (probs(i)~=0)    log_probs(i)=log2(probs(i));  endendinfo = sum(-(probs.*log_probs));
上一页 12
💿 文件大小 1681 K
👤 上传用户 zhiver
📂 所属分类数学计算
🏷️ 相关标签

#matlab #FullBNT #murphyk #home
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -