preprocess.m
function [X, Y, num_data, num_feature] = preprocess(D)
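% PREPROCESS  Prepare a raw data matrix for training.
%   [X, Y, num_data, num_feature] = preprocess(D) reads options from the
%   global struct `preprocess` and returns the feature matrix X, the label
%   matrix Y, and their sizes. The field list below is inferred from how
%   this function uses the struct (an assumption, not an authoritative spec):
%     Sparse, DataSampling, DataSamplingRate, Shuffled, ShotAvailable,
%     MultiClassType, MultiClass.LabelType, MultiClass.NumClass,
%     Normalization, SVD, ChiSquare, ConstraintAvailable, ConstraintFileName.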
global preprocess;
% Sparse data: D is assumed to hold [row, col, value] triplets; convert to a sparse matrix
if (preprocess.Sparse == 1)
    D = spconvert(D);
end
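% For example (illustrative, not from the original source): the triplet rows
% [1 2 5; 3 1 4] become a sparse matrix S with S(1,2) = 5 and S(3,1) = 4.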
% Systematic sampling: keep one row out of every DataSamplingRate rows,
% offset to the middle of each stride
if ((preprocess.DataSampling == 1) && (preprocess.DataSamplingRate > 0))
    [num_data, num_feature] = size(D);
    Index = mod(1:num_data, preprocess.DataSamplingRate);
    D = D(Index == fix((preprocess.DataSamplingRate - 1)/2), :);
end;
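% Worked example (illustrative): with DataSamplingRate = 5, the kept rows are
% those whose index is congruent to fix((5-1)/2) = 2 (mod 5), i.e. rows
% 2, 7, 12, ... -- every fifth row, taken near the middle of each stride.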
% Get the numbers of data points and features
[num_data, num_feature] = size(D);
% Convert NaN to 0
if (any(any(isnan(D))))
    fprintf('Warning: the raw data contains NaN values!\n');
    D(isnan(D)) = 0;
end;
% Randomize (shuffle) the row order of the data set
if (preprocess.Shuffled == 1)
    [B, Index] = sort(rand(num_data, 1)); % random permutation of 1:num_data
    D = D(Index, :);
end;
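% An equivalent idiom (same effect, shown for reference):
%   D = D(randperm(num_data), :);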
% Extract the shot ID information stored in the last column of D
if (preprocess.ShotAvailable == 1)
    preprocess.ShotInfo = D(:, num_feature);
    preprocess.ShotIDSet = unique(preprocess.ShotInfo);
    D = D(:, 1:num_feature-1);
    num_feature = num_feature - 1;
    fprintf('Shot Number: %d\n', length(preprocess.ShotIDSet));
    % Extract the key frame: a non-positive sampling rate means
    % "one median frame per shot" instead of systematic sampling
    if ((preprocess.DataSampling == 1) && (preprocess.DataSamplingRate <= 0))
        D_sampling = zeros(length(preprocess.ShotIDSet), num_feature);
        for j = 1:length(preprocess.ShotIDSet)
            D_shotID = D(preprocess.ShotInfo == preprocess.ShotIDSet(j), :);
            med = fix((sum(preprocess.ShotInfo == preprocess.ShotIDSet(j)) + 1)/2);
            D_sampling(j, :) = D_shotID(med, :); % take the median frame
            %D_sampling(j, :) = sum(D_shotID)/size(D_shotID, 1); % alternative: mean over the shot
            % Keep the median frame's label even if the averaging variant above is enabled
            D_sampling(j, num_feature) = D_shotID(med, num_feature);
        end;
        D = D_sampling;
        clear D_sampling;
        [num_data, num_feature] = size(D);
        preprocess.ShotInfo = preprocess.ShotIDSet; % one row per shot now
        fprintf('Data Number: %d, Feature Number: %d\n', num_data, num_feature);
    end;
end;
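% Illustrative example (not from the original source): if ShotInfo = [7;7;7;9;9],
% shot 7 has 3 frames, so med = fix((3+1)/2) = 2 and its 2nd frame is kept;
% shot 9 has 2 frames, so med = fix((2+1)/2) = 1 and its 1st frame is kept.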
% Partition the data collection into features X and labels Y;
% the last actual_num_class columns of D hold the labels
if ((preprocess.MultiClassType == 0) || (preprocess.MultiClass.LabelType == 1))
    actual_num_class = 1;
else
    actual_num_class = preprocess.MultiClass.NumClass;
end;
num_feature = num_feature - actual_num_class;
X = D(:, 1:num_feature);
Y = full(D(:, num_feature+1:num_feature + actual_num_class));
clear D;
fprintf('Data Number: %d, Feature Number: %d\n', num_data, num_feature);
% Obtain the class set; the label 1 is assumed to be the positive class
class_set = unique(Y);
if (all(class_set ~= 1)), fprintf('Error: 1 must be the positive label!\n'); end;
class_set(class_set == 1) = []; % move 1 to the front of the class set
class_set = [1; class_set];
preprocess.OrgClassSet = class_set;
% Relabel the classes as consecutive integers 1..length(class_set)
Y_dup = Y;
for i = 1:length(class_set)
    Y_dup(Y == class_set(i)) = i;
end;
Y = Y_dup;
% If multi-class classification with single labels, expand the labels into
% a one-hot matrix, so Y becomes a matrix in {0, 1}^(num_data x num_class)
if ((preprocess.MultiClassType ~= 0) && (preprocess.MultiClass.LabelType == 1))
    actual_num_class = length(class_set);
    Y_convert = zeros(num_data, actual_num_class);
    % Linear indexing: element (i, Y(i)) has linear index i + (Y(i)-1)*num_data
    Y_convert((1:num_data)' + (Y - 1) * num_data) = 1;
    Y = Y_convert;
end;
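% Worked example (illustrative): with num_data = 3 and Y = [2; 1; 3], the
% linear indices are [1+3; 2+0; 3+6] = [4; 2; 9], which set Y_convert to
%   [0 1 0
%    1 0 0
%    0 0 1]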
% Obtain the converted class set, again with 1 (the positive class) first
class_set = unique(Y);
class_set(class_set == 1) = [];
class_set = [1; class_set];
preprocess.ClassSet = class_set;
% Normalize the data set: scale each feature by its standard deviation
% (no mean-centering is performed)
if (preprocess.Normalization == 1)
    mStd = std(X);
    % mStd(preprocess.UnNormalizedAttr) = 1;
    mStd(mStd == 0) = 1; % avoid division by zero for constant features
    for i = 1 : num_data
        X(i, :) = X(i, :) ./ mStd;
    end;
end;
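% A vectorized equivalent of the loop above (same behavior, for reference):
%   X = bsxfun(@rdivide, X, mStd);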
% Feature selection for sparse data: keep the (at most 200) columns
% with the largest number of positive entries
if (preprocess.Sparse == 1)
    X_sum = sum(X > 0); % per-column count of positive entries
    [C, I] = sort(X_sum); % ascending, so the densest columns come last
    X_nonzero_col = full(sum(X_sum > 0));
    num_feature = min(200, X_nonzero_col);
    X = full(X(:, I(length(I) - num_feature + 1:length(I))));
end
if ((preprocess.SVD > 0) && (num_feature > preprocess.SVD)) % SVD reduction
    SVD_dimension = preprocess.SVD;
    fprintf('SVD: reduce to %d dimensions\n', SVD_dimension);
    [U, S, V] = svds(X', SVD_dimension);
    X = (U'*X')'; % project onto the leading singular directions
    num_feature = SVD_dimension;
end;
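% Note: svds(X', k) returns the k leading singular triplets of X', so U spans
% the top-k feature subspace and (U'*X')' is equivalent to the shorter X*U.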
% Chi-square feature selection (binary classification only): for each
% feature, build a 2x2 contingency table by thresholding at the feature
% mean, then score the observed counts against the class-prior expectations
if ((preprocess.ChiSquare > 0) && (actual_num_class == 1))
    chi = zeros(1, num_feature); % preallocate; skipped features score 0
    for i = 1:num_feature
        datamean = mean(X(:, i));
        train_pos = sum(X(Y == class_set(1), i) >= datamean);
        train_neg = sum(X(Y ~= class_set(1), i) >= datamean);
        train_all = train_pos + train_neg;
        ntrain_pos = sum(X(Y == class_set(1), i) < datamean);
        ntrain_neg = sum(X(Y ~= class_set(1), i) < datamean);
        ntrain_all = ntrain_pos + ntrain_neg;
        if (train_all == 0), fprintf('Error: empty cell in chi-square feature selection\n'); continue; end;
        if (ntrain_all == 0), fprintf('Error: empty cell in chi-square feature selection\n'); continue; end;
        truth_pos_per = sum(Y == class_set(1)) / num_data;
        truth_neg_per = sum(Y ~= class_set(1)) / num_data;
        truth_pos_per = (truth_pos_per == 0) * 1e-5 + truth_pos_per; % guard against zero priors
        truth_neg_per = (truth_neg_per == 0) * 1e-5 + truth_neg_per;
        chi(i) = (train_pos - train_all * truth_pos_per) ^ 2 / (train_all * truth_pos_per) ...
            + (train_neg - train_all * truth_neg_per) ^ 2 / (train_all * truth_neg_per) ...
            + (ntrain_pos - ntrain_all * truth_pos_per) ^ 2 / (ntrain_all * truth_pos_per) ...
            + (ntrain_neg - ntrain_all * truth_neg_per) ^ 2 / (ntrain_all * truth_neg_per);
    end;
    fprintf('Chi squared: '); fprintf('%.3f, ', chi); fprintf('\n');
    X = X(:, chi >= preprocess.ChiSquare); % keep features that pass the threshold
    num_feature = size(X, 2);
end;
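% The score above is the usual Pearson statistic, chi^2 = sum over the four
% cells of (O - E)^2 / E, where O is the observed count in the thresholded
% 2x2 table and E = (row total) * (class prior) is the expected count.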
% Read pairwise constraints: each row of the constraint file is assumed to be
% "shotID1, shotID2, label"; each pair is represented by its shots' mean features
if ((preprocess.ConstraintAvailable == 1) && (preprocess.ShotAvailable == 1))
    constraintMap = dlmread(preprocess.ConstraintFileName, ',');
    preprocess.constraintMap = constraintMap;
    preprocess.ConPair1 = []; preprocess.ConPair2 = []; preprocess.LabelPair = [];
    for j = 1:size(constraintMap, 1)
        preprocess.ConPair1(j, :) = mean(X(preprocess.ShotInfo == constraintMap(j, 1), :), 1);
        preprocess.ConPair2(j, :) = mean(X(preprocess.ShotInfo == constraintMap(j, 2), :), 1);
        preprocess.LabelPair(j) = constraintMap(j, 3);
    end;
    preprocess.constraintUsed = ones(size(constraintMap, 1), 1);
end;
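% Example usage (a minimal sketch; all field values below are illustrative
% assumptions, not defaults from the original code):
%
%   global preprocess;
%   preprocess.Sparse = 0;
%   preprocess.DataSampling = 0;  preprocess.DataSamplingRate = 0;
%   preprocess.Shuffled = 1;
%   preprocess.ShotAvailable = 0;
%   preprocess.MultiClassType = 0;
%   preprocess.MultiClass.LabelType = 1;
%   preprocess.Normalization = 1;
%   preprocess.SVD = 0;
%   preprocess.ChiSquare = 0;
%   preprocess.ConstraintAvailable = 0;
%   D = load('mydata.txt');  % hypothetical data file
%   [X, Y, num_data, num_feature] = preprocess(D);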