preprocess.m

来自「Variable Reduction Testbench通过对变量进行相关性分析」· M 代码 · 共 200 行

M
200
字号

function [ans_msg, out_X, out_H] = preprocess(X, H, delete_cols, substitute_missing, missing_value)

% -------------------------------------------------------------------------
% this code is part of the 'Reduction Testbench' suite
% developed by A. Manganaro, R. Todeschini, A. Ballabio, D. Mauri
% 2006 - Milano Chemometrics and QSAR Research Group
% -------------------------------------------------------------------------
%
%
% [ans_msg, out_X, out_H] = preprocess(X, H, delete_cols, substitute_missing, missing_value)
%
% preprocess checks the given dataset for constant columns (variables)
% and for missing values, optionally cleaning both, and writes a report
% of what it found to the text file 'preproc_log.txt' in the current folder.
%
% Input:
% X = dataset [n x p] of n objects, p variables (n and p must both be >= 2)
% H = dataset's headers, a [1 x p] cell array of strings, or [] if none
% delete_cols = if set to 'y', constant columns (zero standard deviation)
%   are removed from X (and the matching entries from H)
% substitute_missing =
%   'y' : every missing value is replaced with the mean of the
%         non-missing values of its column
%   'd' : every row containing at least one missing value is deleted
%   anything else : missing values are left untouched
% missing_value = value that marks a missing entry; defaults to -999 when
%   omitted or empty
%
% Output:
% ans_msg = string with the report of what the preprocessor did
%   ([] when nothing was done or when processing failed)
% out_X = processed X (dataset)
% out_H = processed H (dataset's headers)
%
% If the dimensions are wrong, the log file cannot be opened, or an error
% occurs while processing, out_X and out_H are the unmodified inputs.


echo off;

% Outputs default to the untouched inputs so that every exit path
% (early return, fopen failure, caught error) hands something back.
ans_msg = [];
out_X = X;
out_H = H;

[n,p] = size(X);

if (p<2) || (n<2)
    disp('Wrong matrix dimension');
    return;
end

if (nargin < 5) || isempty(missing_value)
    missing_value = -999;
end

file_id = -1;

try
    file_id = fopen('preproc_log.txt', 'wt');
    if (file_id==-1)
        warning_box('Error opening the preprocessor output file');
        return;
    end

    if strcmp(delete_cols,'y')

        % Checks for constant variables (columns)

        const_idx = find(std(X)==0);

        if ~isempty(const_idx)

            len_const_var = length(const_idx);

            % A constant column holds the same value in every row, so the
            % first row is enough to report that value.
            const_val = X(1,const_idx);

            if isempty(H)
                fprintf(file_id,'Columns with constant values:\n\n');
                fprintf(file_id,' Var\tValue\n');
                fprintf(file_id,'-------------------\n');
                for i=1:len_const_var
                    fprintf(file_id,'%4.0d\t%4.3f\t\n',const_idx(i),const_val(i));
                end
                fprintf(file_id,'\n\n');
            else
                % Formats the headers together with the 'Name' title
                % (strings_format is a helper defined elsewhere in the suite)
                tmp_h = [{'Name'} H];
                [tmp_h, max_len] = strings_format(tmp_h, 4, 35);
                headers_title = tmp_h{1};
                formatted_H = tmp_h(2:end);

                fprintf(file_id,'Columns with constant values:\n\n');
                fprintf(file_id,' Var\t%s\tValue\n',headers_title);
                fprintf(file_id,'---------------------------------------------\n');
                for i=1:len_const_var
                    fprintf(file_id,'%4.0d\t%s\t%4.3f\t\n',const_idx(i),...
                        formatted_H{const_idx(i)},const_val(i));
                end
                fprintf(file_id,'\n\n');
            end

            % Drop all constant columns (and their headers) in one step
            X(:,const_idx) = [];
            if ~isempty(H)
                H(const_idx) = [];
            end

            ans_msg = ['Preprocessor found ' num2str(len_const_var) ...
                    ' constant variables and deleted them. '];

        end

    end


    if strcmp(substitute_missing,'y')

        is_missing = (X==missing_value);
        [missing_i, missing_j] = find(is_missing);
        n_missing = length(missing_i);

        if n_missing>0

            fprintf(file_id,'Found %d missing values:\n',n_missing);

            % Mean of the non-missing entries, computed once per affected
            % column and indexed by COLUMN number. A column made only of
            % missing values yields NaN.
            col_mean = NaN(1,size(X,2));
            affected_cols = unique(missing_j);
            for c = affected_cols(:)'
                col_mean(c) = mean(X(~is_missing(:,c),c));
            end

            for idx = 1:n_missing
                X(missing_i(idx),missing_j(idx)) = col_mean(missing_j(idx));
                fprintf(file_id,'Missing value found at %d,%d\n',missing_i(idx),missing_j(idx));
            end

            ans_msg = [ans_msg ['Preprocessor found ' num2str(n_missing) ...
                        ' missing values and changed them to the mean value.']];

        end

    elseif strcmp(substitute_missing,'d')

        [missing_i, missing_j] = find(X==missing_value);

        if ~isempty(missing_i)

            % Rows holding at least one missing value, reported from the
            % last row to the first
            bad_rows = unique(missing_i);
            bad_rows = bad_rows(:)';
            for r = bad_rows(end:-1:1)
                fprintf(file_id,'Missing values on line %d - line deleted:\n',r);
            end
            del_count = length(bad_rows);

            % Actually remove the reported rows from the dataset
            X(bad_rows,:) = [];

            fprintf(file_id,'\nTotal: %d lines deleted\n',del_count);
            ans_msg = [ans_msg ['Preprocessor found ' num2str(del_count) ...
                        ' lines with missing values and deleted them.']];

        end

    end


    echo on;

    fclose(file_id);
    file_id = -1;

    % Publish the processed data only once everything succeeded
    out_X = X;
    out_H = H;

catch
    % Close only the log file opened here, not files owned by the caller
    if file_id ~= -1
        try
            fclose(file_id);
        catch
        end
    end
    % out_X / out_H still hold the original inputs, so the report must not
    % claim that anything was changed
    ans_msg = [];
    warning_box('Error while writing the preprocessor output file');
end


% --------------------------------------------------------------------
    
    
function warning_box(cur_msg)
% WARNING_BOX  Show a message to the user in a message box.
%   cur_msg   text of the message to display

% msgbox receives the text wrapped in a one-element cell array
box_text = {cur_msg};
msgbox(box_text);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?