preprocess.m
function [X, Y, num_data, num_feature] = preprocess(D)
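% PREPROCESS  Prepare a raw data matrix for training.
%   [X, Y, num_data, num_feature] = preprocess(D) reads options from the
%   global struct `preprocess` and returns the feature matrix X, the label
%   matrix Y, and their sizes. The field list below is inferred from how
%   this function uses the struct (an assumption, not an authoritative spec):
%     Sparse, DataSampling, DataSamplingRate, Shuffled, ShotAvailable,
%     MultiClassType, MultiClass.LabelType, MultiClass.NumClass,
%     Normalization, SVD, ChiSquare, ConstraintAvailable, ConstraintFileName.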
global preprocess;
% Sparse data: D is assumed to hold [row, col, value] triplets; convert to a sparse matrix
if (preprocess.Sparse == 1)
    D = spconvert(D);
end
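% For example (illustrative, not from the original source): the triplet rows
% [1 2 5; 3 1 4] become a sparse matrix S with S(1,2) = 5 and S(3,1) = 4.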
% Systematic sampling: keep one row out of every DataSamplingRate rows,
% offset to the middle of each stride
if ((preprocess.DataSampling == 1) && (preprocess.DataSamplingRate > 0))
    [num_data, num_feature] = size(D);
    Index = mod(1:num_data, preprocess.DataSamplingRate);
    D = D(Index == fix((preprocess.DataSamplingRate - 1)/2), :);
end;
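% Worked example (illustrative): with DataSamplingRate = 5, the kept rows are
% those whose index is congruent to fix((5-1)/2) = 2 (mod 5), i.e. rows
% 2, 7, 12, ... -- every fifth row, taken near the middle of each stride.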
% Get the numbers of data points and features
[num_data, num_feature] = size(D);
% Convert NaN to 0
if (any(any(isnan(D))))
    fprintf('Warning: the raw data contains NaN values!\n');
    D(isnan(D)) = 0;
end;
% Randomize (shuffle) the row order of the data set
if (preprocess.Shuffled == 1)
    [B, Index] = sort(rand(num_data, 1)); % random permutation of 1:num_data
    D = D(Index, :);
end;
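% An equivalent idiom (same effect, shown for reference):
%   D = D(randperm(num_data), :);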
% Extract the shot ID information stored in the last column of D
if (preprocess.ShotAvailable == 1)
    preprocess.ShotInfo = D(:, num_feature);
    preprocess.ShotIDSet = unique(preprocess.ShotInfo);
    D = D(:, 1:num_feature-1);
    num_feature = num_feature - 1;
    fprintf('Shot Number: %d\n', length(preprocess.ShotIDSet));
    % Extract the key frame: a non-positive sampling rate means
    % "one median frame per shot" instead of systematic sampling
    if ((preprocess.DataSampling == 1) && (preprocess.DataSamplingRate <= 0))
        D_sampling = zeros(length(preprocess.ShotIDSet), num_feature);
        for j = 1:length(preprocess.ShotIDSet)
            D_shotID = D(preprocess.ShotInfo == preprocess.ShotIDSet(j), :);
            med = fix((sum(preprocess.ShotInfo == preprocess.ShotIDSet(j)) + 1)/2);
            D_sampling(j, :) = D_shotID(med, :); % take the median frame
            %D_sampling(j, :) = sum(D_shotID)/size(D_shotID, 1); % alternative: mean over the shot
            % Keep the median frame's label even if the averaging variant above is enabled
            D_sampling(j, num_feature) = D_shotID(med, num_feature);
        end;
        D = D_sampling;
        clear D_sampling;
        [num_data, num_feature] = size(D);
        preprocess.ShotInfo = preprocess.ShotIDSet; % one row per shot now
        fprintf('Data Number: %d, Feature Number: %d\n', num_data, num_feature);
    end;
end;
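% Illustrative example (not from the original source): if ShotInfo = [7;7;7;9;9],
% shot 7 has 3 frames, so med = fix((3+1)/2) = 2 and its 2nd frame is kept;
% shot 9 has 2 frames, so med = fix((2+1)/2) = 1 and its 1st frame is kept.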
% Partition the data collection into features X and labels Y;
% the last actual_num_class columns of D hold the labels
if ((preprocess.MultiClassType == 0) || (preprocess.MultiClass.LabelType == 1))
    actual_num_class = 1;
else
    actual_num_class = preprocess.MultiClass.NumClass;
end;
num_feature = num_feature - actual_num_class;
X = D(:, 1:num_feature);
Y = full(D(:, num_feature+1:num_feature + actual_num_class));
clear D;
fprintf('Data Number: %d, Feature Number: %d\n', num_data, num_feature);
% Obtain the class set; the label 1 is assumed to be the positive class
class_set = unique(Y);
if (all(class_set ~= 1)), fprintf('Error: 1 must be the positive label!\n'); end;
class_set(class_set == 1) = []; % move 1 to the front of the class set
class_set = [1; class_set];
preprocess.OrgClassSet = class_set;
% Relabel the classes as consecutive integers 1..length(class_set)
Y_dup = Y;
for i = 1:length(class_set)
    Y_dup(Y == class_set(i)) = i;
end;
Y = Y_dup;
% If multi-class classification with single labels, expand the labels into
% a one-hot matrix, so Y becomes a matrix in {0, 1}^(num_data x num_class)
if ((preprocess.MultiClassType ~= 0) && (preprocess.MultiClass.LabelType == 1))
    actual_num_class = length(class_set);
    Y_convert = zeros(num_data, actual_num_class);
    % Linear indexing: element (i, Y(i)) has linear index i + (Y(i)-1)*num_data
    Y_convert((1:num_data)' + (Y - 1) * num_data) = 1;
    Y = Y_convert;
end;
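% Worked example (illustrative): with num_data = 3 and Y = [2; 1; 3], the
% linear indices are [1+3; 2+0; 3+6] = [4; 2; 9], which set Y_convert to
%   [0 1 0
%    1 0 0
%    0 0 1]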
% Obtain the converted class set, again with 1 (the positive class) first
class_set = unique(Y);
class_set(class_set == 1) = [];
class_set = [1; class_set];
preprocess.ClassSet = class_set;
% Normalize the data set: scale each feature by its standard deviation
% (no mean-centering is performed)
if (preprocess.Normalization == 1)
    mStd = std(X);
    % mStd(preprocess.UnNormalizedAttr) = 1;
    mStd(mStd == 0) = 1; % avoid division by zero for constant features
    for i = 1 : num_data
        X(i, :) = X(i, :) ./ mStd;
    end;
end;
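% A vectorized equivalent of the loop above (same behavior, for reference):
%   X = bsxfun(@rdivide, X, mStd);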
% Feature selection for sparse data: keep the (at most 200) columns
% with the largest number of positive entries
if (preprocess.Sparse == 1)
    X_sum = sum(X > 0); % per-column count of positive entries
    [C, I] = sort(X_sum); % ascending, so the densest columns come last
    X_nonzero_col = full(sum(X_sum > 0));
    num_feature = min(200, X_nonzero_col);
    X = full(X(:, I(length(I) - num_feature + 1:length(I))));
end
if ((preprocess.SVD > 0) && (num_feature > preprocess.SVD)) % SVD reduction
    SVD_dimension = preprocess.SVD;
    fprintf('SVD: reduce to %d dimensions\n', SVD_dimension);
    [U, S, V] = svds(X', SVD_dimension);
    X = (U'*X')'; % project onto the leading singular directions
    num_feature = SVD_dimension;
end;
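% Note: svds(X', k) returns the k leading singular triplets of X', so U spans
% the top-k feature subspace and (U'*X')' is equivalent to the shorter X*U.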
% Chi-square feature selection (binary classification only): for each
% feature, build a 2x2 contingency table by thresholding at the feature
% mean, then score the observed counts against the class-prior expectations
if ((preprocess.ChiSquare > 0) && (actual_num_class == 1))
    chi = zeros(1, num_feature); % preallocate; skipped features score 0
    for i = 1:num_feature
        datamean = mean(X(:, i));
        train_pos = sum(X(Y == class_set(1), i) >= datamean);
        train_neg = sum(X(Y ~= class_set(1), i) >= datamean);
        train_all = train_pos + train_neg;
        ntrain_pos = sum(X(Y == class_set(1), i) < datamean);
        ntrain_neg = sum(X(Y ~= class_set(1), i) < datamean);
        ntrain_all = ntrain_pos + ntrain_neg;
        if (train_all == 0), fprintf('Error: empty cell in chi-square feature selection\n'); continue; end;
        if (ntrain_all == 0), fprintf('Error: empty cell in chi-square feature selection\n'); continue; end;
        truth_pos_per = sum(Y == class_set(1)) / num_data;
        truth_neg_per = sum(Y ~= class_set(1)) / num_data;
        truth_pos_per = (truth_pos_per == 0) * 1e-5 + truth_pos_per; % guard against zero priors
        truth_neg_per = (truth_neg_per == 0) * 1e-5 + truth_neg_per;
        chi(i) = (train_pos - train_all * truth_pos_per) ^ 2 / (train_all * truth_pos_per) ...
            + (train_neg - train_all * truth_neg_per) ^ 2 / (train_all * truth_neg_per) ...
            + (ntrain_pos - ntrain_all * truth_pos_per) ^ 2 / (ntrain_all * truth_pos_per) ...
            + (ntrain_neg - ntrain_all * truth_neg_per) ^ 2 / (ntrain_all * truth_neg_per);
    end;
    fprintf('Chi squared: '); fprintf('%.3f, ', chi); fprintf('\n');
    X = X(:, chi >= preprocess.ChiSquare); % keep features that pass the threshold
    num_feature = size(X, 2);
end;
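% The score above is the usual Pearson statistic, chi^2 = sum over the four
% cells of (O - E)^2 / E, where O is the observed count in the thresholded
% 2x2 table and E = (row total) * (class prior) is the expected count.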
% Read pairwise constraints: each row of the constraint file is assumed to be
% "shotID1, shotID2, label"; each pair is represented by its shots' mean features
if ((preprocess.ConstraintAvailable == 1) && (preprocess.ShotAvailable == 1))
    constraintMap = dlmread(preprocess.ConstraintFileName, ',');
    preprocess.constraintMap = constraintMap;
    preprocess.ConPair1 = []; preprocess.ConPair2 = []; preprocess.LabelPair = [];
    for j = 1:size(constraintMap, 1)
        preprocess.ConPair1(j, :) = mean(X(preprocess.ShotInfo == constraintMap(j, 1), :), 1);
        preprocess.ConPair2(j, :) = mean(X(preprocess.ShotInfo == constraintMap(j, 2), :), 1);
        preprocess.LabelPair(j) = constraintMap(j, 3);
    end;
    preprocess.constraintUsed = ones(size(constraintMap, 1), 1);
end;
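% Example usage (a minimal sketch; all field values below are illustrative
% assumptions, not defaults from the original code):
%
%   global preprocess;
%   preprocess.Sparse = 0;
%   preprocess.DataSampling = 0;  preprocess.DataSamplingRate = 0;
%   preprocess.Shuffled = 1;
%   preprocess.ShotAvailable = 0;
%   preprocess.MultiClassType = 0;
%   preprocess.MultiClass.LabelType = 1;
%   preprocess.Normalization = 1;
%   preprocess.SVD = 0;
%   preprocess.ChiSquare = 0;
%   preprocess.ConstraintAvailable = 0;
%   D = load('mydata.txt');  % hypothetical data file
%   [X, Y, num_data, num_feature] = preprocess(D);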