% preprocess.m
% From "Variable Reduction Testbench - correlation analysis of variables" - MATLAB (M) code - 200 lines in total
% M
% 200 lines
function [ans_msg, out_X, out_H] = preprocess(X, H, delete_cols, substitute_missing, missing_value)
% -------------------------------------------------------------------------
% this code is part of the 'Reduction Testbench' suite
% developed by A. Manganaro, R. Todeschini, A. Ballabio, D. Mauri
% 2006 - Milano Chemometrics and QSAR Research Group
% -------------------------------------------------------------------------
%
%
% [ans_msg, out_X, out_H] = preprocess(X, H, delete_cols, substitute_missing, missing_value)
%
% preprocess checks the given dataset for constant columns (variables)
% and for missing values, writing a report to 'preproc_log.txt' in the
% current folder.
%
% Input:
% X                  = dataset [n x p] of n objects, p variables (n,p >= 2)
% H                  = dataset's headers [1 x p] cell row, or empty
% delete_cols        = if set to 'y', constant columns in X are deleted
% substitute_missing = if set to 'y', missing values found in X are
%                      substituted with the mean of the non-missing values
%                      of their column;
%                      if set to 'd', every row of X that contains at least
%                      one missing value is deleted
% missing_value      = value that marks a missing entry (default -999
%                      when empty)
%
% Output:
% ans_msg = string with the report of what the preprocessor did
% out_X   = processed X (dataset)
% out_H   = processed H (dataset's headers)
%
% If the matrix is too small, the log file cannot be opened, or an error
% occurs while processing, out_X and out_H are the unmodified inputs.
echo off;
[n,p] = size(X);
ans_msg = [];

% Default the outputs to the inputs so that every exit path (early return,
% fopen failure, catch branch) hands back defined values.
out_X = X;
out_H = H;

if ( (p<2) || (n<2) )
    disp('Wrong matrix dimension');
    return;
end

file_id = -1;
try
    file_id = fopen('preproc_log.txt', 'wt');
    if (file_id==-1)
        warning_box('Error opening the preprocessor output file');
        return;
    end

    if strcmp(delete_cols,'y')
        % A column is constant when its standard deviation is exactly zero
        const_idx = find(std(X)==0);
        if ~isempty(const_idx)
            len_const_var = length(const_idx);
            % The constant value of each column is simply its first entry
            const_val = X(1,const_idx);

            if isempty(H)
                fprintf(file_id,'Columns with constant values:\n\n');
                fprintf(file_id,' Var\tValue\n');
                fprintf(file_id,'-------------------\n');
                for i=1:len_const_var
                    fprintf(file_id,'%4.0d\t%4.3f\t\n',const_idx(i),const_val(i));
                end
                fprintf(file_id,'\n\n');
            else
                % Formats the headers (strings_format is a suite helper
                % defined outside this file)
                tmp_h = [{'Name'} H];
                [tmp_h, max_len] = strings_format(tmp_h, 4, 35);
                headers_title = tmp_h{1};
                formatted_H = tmp_h(2:end);
                fprintf(file_id,'Columns with constant values:\n\n');
                fprintf(file_id,' Var\t%s\tValue\n',headers_title);
                fprintf(file_id,'---------------------------------------------\n');
                for i=1:len_const_var
                    fprintf(file_id,'%4.0d\t%s\t%4.3f\t\n',const_idx(i),...
                        formatted_H{const_idx(i)},const_val(i));
                end
                fprintf(file_id,'\n\n');
            end

            % Drop all constant columns (and their headers) in one step
            X(:,const_idx) = [];
            if ~isempty(H)
                H(const_idx) = [];
            end

            ans_msg = ['Preprocessor found ' num2str(len_const_var) ...
                ' constant variables and deleted them. '];
        end
    end

    if strcmp(substitute_missing,'y')
        if isempty(missing_value)
            missing_value = -999;
        end
        miss_mask = (X==missing_value);
        [missing_i, missing_j] = find(miss_mask);
        if ~isempty(missing_i)
            fprintf(file_id,'Found %d missing values:\n',length(missing_i));

            % Compute each affected column's mean exactly once, from its
            % non-missing entries only, then fill that column's gaps.
            % A column made up entirely of missing values gets NaN
            % (mean of an empty vector).
            miss_cols = unique(missing_j);
            for k = 1:length(miss_cols)
                c = miss_cols(k);
                col_mean = mean(X(~miss_mask(:,c),c));
                X(miss_mask(:,c),c) = col_mean;
            end

            % Log every replaced cell, in find() (column-major) order
            for idx = 1:length(missing_i)
                fprintf(file_id,'Missing value found at %d,%d\n',missing_i(idx),missing_j(idx));
            end

            ans_msg = [ans_msg ['Preprocessor found ' num2str(length(missing_i)) ...
                ' missing values and changed them to the mean value.']];
        end
    elseif strcmp(substitute_missing,'d')
        if isempty(missing_value)
            missing_value = -999;
        end
        [missing_i, missing_j] = find(X==missing_value);
        if ~isempty(missing_i)
            % Rows holding at least one missing value; logged from the
            % last row to the first
            bad_rows = unique(missing_i);
            del_count = length(bad_rows);
            for k = del_count:-1:1
                fprintf(file_id,'Missing values on line %d - line deleted:\n',bad_rows(k));
            end
            % Actually remove the rows that the log and the report
            % message declare as deleted
            X(bad_rows,:) = [];
            fprintf(file_id,'\nTotal: %d lines deleted\n',del_count);
            ans_msg = [ans_msg ['Preprocessor found ' num2str(del_count) ...
                ' lines with missing values and deleted them.']];
        end
    end

    echo on;
    out_X = X;
    out_H = H;
    fclose(file_id);
catch
    % Close only our own log file; fclose('all') would also close files
    % opened by the caller. The inner try covers the case where the handle
    % is already closed or invalid.
    if (file_id~=-1)
        try
            fclose(file_id);
        catch
        end
    end
    warning_box('Error while writing the preprocessor output file');
end
% --------------------------------------------------------------------
function warning_box(cur_msg)
% warning_box shows a message in a dialog box via msgbox.
%
% Input:
% cur_msg = message string to display
dialog_lines = {cur_msg};   % msgbox receives the text wrapped in a cell
msgbox(dialog_lines);
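% Keyboard shortcuts (residue from the code-viewer web page):
%   Copy code: Ctrl + C
%   Search code: Ctrl + F
%   Full-screen mode: F11
%   Increase font size: Ctrl + =
%   Decrease font size: Ctrl + -
%   Show shortcuts: ?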