% matlab_tutorial.m
% Demonstration program for applications of the MATLAB toolboxes
% (data mining and introductory bioinformatics problems).
A
pos = find(A(:, 2) > 5)
a_found = A(pos, :) % print these rows

% another way to do the same thing
a_found1 = A(find(A(:, 2) > 5), :)
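
% the same selection can also be written with logical indexing,
% without an explicit call to find
a_found2 = A(A(:, 2) > 5, :)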

% generate a matrix of random numbers (uniform distribution)
B = rand(5000, 5);
hist(B(:, 1), 20);   % histogram of column 1 in 20 bins

% generate a matrix of random numbers (normal distribution)
B = randn(5000, 5);
hist(B(:, 5), 30);   % histogram of column 5 in 30 bins

%***********************************************************
%***********************************************************
%***********************************************************
% DATA MINING PROBLEMS
%***********************************************************
%***********************************************************
%***********************************************************

%***********************************************************
% 1. Preliminaries: Loading Data and Analysis
%***********************************************************

% clear all variables from the workspace (similar to "clear all")
clear

% load the data (13 features, 1 target variable)
D = load('housing.dat');

% get the number of data points (examples) and features
[num_examples, num_cols] = size(D)
num_features = num_cols - 1;

% look at the histogram of the first feature and target variable
hist(D(:, 1), 15);  % numerical feature
hist(D(:, 4), 15);  % binary feature (categorical, in general)
hist(D(:, 14), 15); % target

% calculate correlation coefficient between columns 4 and 7
corrcoef(D(:, [4 7]))

% visualize linear correlation between 3 features and a target
plotmatrix(D(:, [4 6 13 14]))

% make a box plot for columns 3 and 6 in the same figure
boxplot(D(:, [3 6]))

%***********************************************************
% 2. Classification Problem: Logistic Regression
%***********************************************************

clear

% get the dataset
D = load('pima.txt');
[pima_row, pima_col] = size(D)
num_features = pima_col - 1

% before prediction, target classes need to be 0-1 (not 1-2 as in the dataset)
pos = find(D(:, size(D, 2)) == 2);
D(pos, size(D, 2)) = 0;

% now, use 30% for the test set
[train, test] = divideset(D, 30);     
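
% 'divideset' is a helper distributed with the course material, not a built-in
% MATLAB function. A minimal sketch of what it is assumed to do (a random row
% split, with the second output receiving the given percentage of the data)
% would look roughly like this if saved in its own file divideset.m:
%
%   function [part1, part2] = divideset(data, pct)
%   idx   = randperm(size(data, 1));          % random permutation of the rows
%   n2    = round(size(data, 1) * pct / 100); % rows that go to the second part
%   part2 = data(idx(1 : n2), :);
%   part1 = data(idx(n2 + 1 : end), :);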

% normalize training set
[meanv, stdv, train(:, 1 : num_features)] = normalize(train(:, 1 : num_features), [], []);

% normalize test set (notice that 'meanv' and 'stdv' from the training are used to normalize)
[meanv, stdv, test(:, 1 : num_features)] = normalize(test(:, 1 : num_features), meanv, stdv);
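
% 'normalize' is also a course helper. Judging from how it is called, it is
% assumed to z-score each column: with empty mean/std arguments it computes the
% statistics from the data, otherwise it reuses the supplied (training)
% statistics. A rough sketch, to be saved as normalize.m:
%
%   function [meanv, stdv, out] = normalize(data, meanv, stdv)
%   if isempty(meanv)
%       meanv = mean(data);
%       stdv  = std(data);
%   end
%   n   = size(data, 1);
%   out = (data - repmat(meanv, n, 1)) ./ repmat(stdv, n, 1);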

% add a column of ones into both training and test sets
train = [train(:, 1 : num_features) ones(size(train, 1), 1) train(:, num_features + 1)];
test = [test(:, 1 : num_features) ones(size(test, 1), 1) test(:, num_features + 1)];

% train logistic regression model
LR_model = logit(train(:, size(train, 2)), train(:, 1 : num_features + 1));
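
% 'logit' is a course helper that fits a logistic regression model and returns
% a struct whose field 'beta' holds the coefficients (that is how it is used
% below). One way such a fit could be implemented is plain gradient ascent on
% the log-likelihood (a sketch only; the actual helper may use IRLS or a
% different stopping rule):
%
%   function model = logit(y, X)
%   % y: 0/1 targets, X: design matrix (bias column already included)
%   beta = zeros(size(X, 2), 1);
%   for iter = 1 : 5000
%       p    = 1 ./ (1 + exp(-X * beta));     % predicted probabilities
%       beta = beta + 0.001 * (X' * (y - p)); % gradient step on log-likelihood
%   end
%   model.beta = beta;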

% calculate prediction on the test data
prediction = test(:, 1 : num_features + 1) * LR_model.beta;
% transform results to the 0-1 interval (the "logsig" function can also be used)
prediction = 1 ./ (1 + exp(-prediction));

% evaluate accuracy of prediction
q00 = find(prediction < 0.5 & test(:, num_features + 2) == 0);  % predicted class 0, true class 0
q01 = find(prediction < 0.5 & test(:, num_features + 2) == 1);  % predicted class 0, true class 1
q10 = find(prediction >= 0.5 & test(:, num_features + 2) == 0); % predicted class 1, true class 0
q11 = find(prediction >= 0.5 & test(:, num_features + 2) == 1); % predicted class 1, true class 1

% calculate accuracy of prediction
accuracy = (length(q00) + length(q11)) / (length(q00) + length(q11) + length(q01) + length(q10))

% possibly a better way is to calculate the accuracy for each class separately
sensitivity = length(q11) / (length(q11) + length(q01)) % accuracy on class 1
specificity = length(q00) / (length(q10) + length(q00)) % accuracy on class 0

% balanced accuracy: the average of the two per-class accuracies
accuracy = (sensitivity + specificity) / 2

% Question: if the accuracy is calculated by the last formula, is there a better way to build the model?
% Consider changing the number of examples taken from each class.
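
% One possible approach (a sketch, not part of the original assignment): balance
% the training set by downsampling the majority class before fitting the model,
% so that both classes contribute equally during training:
%
%   idx0  = find(train(:, end) == 0);
%   idx1  = find(train(:, end) == 1);
%   n_min = min(length(idx0), length(idx1));
%   idx0  = idx0(randperm(length(idx0)));  idx0 = idx0(1 : n_min);
%   idx1  = idx1(randperm(length(idx1)));  idx1 = idx1(1 : n_min);
%   balanced_train = train([idx0; idx1], :);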

%***********************************************************
% 3. Regression Problem - predicting real-numbered outputs
%***********************************************************

clear
clc

D = load('housing.dat');
num_features = size(D, 2) - 1;

% split the data into training, validation and test subsets
% leave 30% of data in test set
[tr_val, test] = divideset(D, 30);     
% assign 30% of tr_val to validation and 70% to training set
[train, val] = divideset(tr_val, 30);                                         

% normalize all the features and target to mean zero and variance 1
[meanv, stdv, train] = normalize(train, [], []);
[meanv, stdv, val] = normalize(val, meanv, stdv);
[meanv, stdv, test] = normalize(test, meanv, stdv);

% initialize neural network
num_hidd = 3;   % choose the number of hidden neurons
% we define neural network with one hidden layer, sigmoid activation functions in the hidden layer,
% linear activation in the output layer. Training is chosen to be a simple backpropagation

method = 2;
switch method
  case 1,    train_alg = 'traingd'    % Standard Backpropagation
  case 2,    train_alg = 'trainrp'    % Resilient Backpropagation
  case 3,    train_alg = 'traingdx'   % Variable Learning Rate
  case 4,    train_alg = 'traincgb'   % Powell-Beale Conjugate Gradient
  case 5,    train_alg = 'trainlm'    % Levenberg-Marquardt
  case 6,    train_alg = 'trainbfg'   % Quasi-Newton Algorithm
  case 7,    train_alg = 'trainscg'   % Scaled Conjugate Gradient
  otherwise, train_alg = 'traingd'
end

net = newff([min(train(:, 1 : num_features)); max(train(:, 1 : num_features))]', ...
      [num_hidd 1], {'logsig', 'purelin'}, train_alg);
net

% define other important parameters for neural network training
% (if you omit the learning rate, MATLAB sets it automatically, which is often the better choice)
net.trainParam.lr = 0.1;      % choose learning rate
net.trainParam.epochs = 250;  % maximum number of epochs
net.trainParam.show = 10;     % show results each 10 epochs
net.trainParam.max_fail = 5;  % stop if the error on validation set does not improve over 5 iterations

% reshape training, validation and test set to fit neural network training function
P    = train(:, 1 : num_features)';
T    = train(:, num_features + 1)';
VV.P = val(:, 1 : num_features)';
VV.T = val(:, num_features + 1)';
TV.P = test(:, 1 : num_features)';
TV.T = test(:, num_features + 1)';

% train neural network
% (note: the data matrix 'train' is no longer needed here, and its name shadows
% the train() function, so clear it before calling the function)
clear train
[net, tr] = train(net, P, T, [], [], VV, TV);

% check the predictions on test set
prediction = sim(net, test(:, 1 : num_features)')';

% calculate difference between output and target variable
pred_error = prediction - test(:, 14);

% plot the histogram of errors (in 20 bins)
hist(pred_error, 20);

% calculate mean square error (MSE)
mse = (pred_error' * pred_error) / length(pred_error)

% calculate R-square value (are we better than a trivial predictor?)
R_square = 1 - mse / std(test(:, 14)) ^ 2

%***********************************************************
% 4. Regression Problem - using PCA
%***********************************************************

% now, try the same problem with PCA analysis
new_dim = 5; % number of new transformed features

% function pca_an does PCA analysis
[pca_data, proj, retvar] = pca_an(D, new_dim, 1);
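
% 'pca_an' is a course helper, not a built-in. From the way its outputs are
% used below it is assumed to project the 13 feature columns onto the first
% new_dim principal components and append the original target as the last
% column; 'proj' would then be the projection matrix and 'retvar' the fraction
% of variance retained (the meaning of the third argument is not documented
% here). A rough sketch of the core computation:
%
%   function [pca_data, proj, retvar] = pca_an(D, new_dim, flag)
%   X  = D(:, 1 : end - 1);                        % features only
%   Xc = X - repmat(mean(X), size(X, 1), 1);       % center each feature
%   [U, S, V] = svd(Xc, 'econ');                   % principal directions in V
%   proj     = V(:, 1 : new_dim);
%   retvar   = sum(diag(S(1 : new_dim, 1 : new_dim)) .^ 2) / sum(diag(S) .^ 2);
%   pca_data = [Xc * proj, D(:, end)];             % scores plus original target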

% total number of attributes in the new matrix pca_data
pca_col = new_dim + 1;

% split the data into training, validation and test subsets (same as before)
[pca_tr_val, pca_test] = divideset(pca_data, 30);
[pca_train, pca_val] = divideset(pca_tr_val, 30);

% normalize all the features and target to mean zero and variance 1
[meanv, stdv, pca_train] = normalize(pca_train, [], []);
[meanv, stdv, pca_val]   = normalize(pca_val, meanv, stdv);
[meanv, stdv, pca_test]  = normalize(pca_test, meanv, stdv);


% set neural network parameters
num_hidd = 5;
pca_net = newff([min(pca_train(:, 1 : new_dim)); max(pca_train(:, 1 : new_dim))]', ...
  [num_hidd 1], {'logsig', 'purelin'}, train_alg);
pca_net
    
% set parameters, just as above
pca_net.trainParam.lr = 0.1;      % choose learning rate
pca_net.trainParam.epochs = 250;  % maximum number of epochs
pca_net.trainParam.show = 10;     % show results each 10 epochs
pca_net.trainParam.max_fail = 5;  % stop if the error on validation set does not improve over 5 iterations

% reshape training, validation and test set to fit neural network training function
P    = pca_train(:, 1 : new_dim)';
T    = pca_train(:, new_dim + 1)';
VV.P = pca_val(:, 1 : new_dim)';
VV.T = pca_val(:, new_dim + 1)';
TV.P = pca_test(:, 1 : new_dim)';
TV.T = pca_test(:, new_dim + 1)';

% train network
[pca_net, pca_tr] = train(pca_net, P, T, [], [], VV, TV);

% check predictions on the test set
pca_prediction = sim(pca_net, pca_test(:, 1 : new_dim)')';

% calculate error with the histogram
pca_error = pca_prediction - pca_test(:, pca_col);
hist(pca_error, 20);

% calculate MSE
pca_mse = (pca_error' * pca_error) / length(pca_error)

% calculate R-square value
pca_R_square = 1 - pca_mse / std(pca_test(:, pca_col)) ^ 2


%***********************************************************
% 5. Classification Problem: Neural Networks
%***********************************************************

clear
clc

D = load('pima.txt');
[pima_row, pima_col] = size(D);
num_features = pima_col - 1;
n_class = 2;

% split the data into training, validation and test subsets
% leave 30% of data in test set
[pima_tr_val, pima_test] = divideset(D, 30);     
% assign 30% of tr_val to validation and 70% to training set
[pima_train, pima_val]   = divideset(pima_tr_val, 30);                                         
    
% normalize the features to mean zero and variance 1 (the 1/2 class labels are left unchanged)
[meanv, stdv, pima_train(:, 1 : num_features)] = normalize(pima_train(:, 1 : num_features), [], []);
[meanv, stdv, pima_val(:, 1 : num_features)]   = normalize(pima_val(:, 1 : num_features), meanv, stdv);
[meanv, stdv, pima_test(:, 1 : num_features)]  = normalize(pima_test(:, 1 : num_features), meanv, stdv);

% initialize neural network
num_hidd = 5;   % choose the number of hidden neurons (only 1 hidden layer)

method = 5;
switch method
  case 1,    train_alg = 'traingd'    % Standard Backpropagation
  case 2,    train_alg = 'trainrp'    % Resilient Backpropagation
  case 3,    train_alg = 'traingdx'   % Variable Learning Rate
  case 4,    train_alg = 'traincgb'   % Powell-Beale Conjugate Gradient
  case 5,    train_alg = 'trainlm'    % Levenberg-Marquardt
  case 6,    train_alg = 'trainbfg'   % Quasi-Newton Algorithm
  case 7,    train_alg = 'trainscg'   % Scaled Conjugate Gradient
  otherwise, train_alg = 'traingd'
end

% observe that we have n_class output nodes now, one for each class
pima_net = newff([min(pima_train(:, 1 : num_features)); max(pima_train(:, 1 : num_features))]',...
      [num_hidd n_class],{'tansig', 'logsig'}, train_alg);
pima_net

% define other important parameters for neural network training
pima_net.trainParam.lr = 0.1;      % choose learning rate
pima_net.trainParam.epochs = 250;  % maximum number of epochs
pima_net.trainParam.show = 10;     % show results each 10 epochs
pima_net.trainParam.max_fail = 5;  % stop if the error on validation set does not improve over 5 iterations

% 'a' - value that defines how the target class is represented: everything above
% 0.9 is considered as class 1.
a = 0.9;
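
% 'class_matrix_gen1' (used just below) is a course helper that turns a column
% of class labels (1 .. n_class) into a one-of-n target matrix for the network;
% it is assumed to put 'a' in the column of the true class and (1 - a) in the
% other columns. A sketch, to be saved as class_matrix_gen1.m:
%
%   function T = class_matrix_gen1(labels, a, n_class)
%   T = (1 - a) * ones(length(labels), n_class);
%   for i = 1 : length(labels)
%       T(i, labels(i)) = a;
%   end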

% reshape training, validation and test set to fit neural network training function
P    = pima_train(:, 1 : num_features)';
T    = class_matrix_gen1(pima_train(:, pima_col), a, n_class)';
VV.P = pima_val(:, 1 : num_features)';
VV.T = class_matrix_gen1(pima_val(:, pima_col), a, n_class)';
TV.P = pima_test(:, 1 : num_features)';
TV.T = class_matrix_gen1(pima_test(:, pima_col), a, n_class)';

% train network
[pima_net, pima_tr] = train(pima_net, P, T, [], [], VV);

% check predictions
pima_pred = sim(pima_net, pima_test(:, 1 : num_features)');

% evaluate accuracy of prediction
q00 = find(pima_pred(1, :) >= pima_pred(2, :) & pima_test(:, num_features + 1)' == 1);  % predicted class 0, true class 0
q01 = find(pima_pred(1, :) >= pima_pred(2, :) & pima_test(:, num_features + 1)' == 2);  % predicted class 0, true class 1
q10 = find(pima_pred(1, :) < pima_pred(2, :) & pima_test(:, num_features + 1)' == 1); % predicted class 1, true class 0
q11 = find(pima_pred(1, :) < pima_pred(2, :) & pima_test(:, num_features + 1)' == 2); % predicted class 1, true class 1

% calculate accuracy of prediction
accuracy = (length(q00) + length(q11)) / (length(q00) + length(q11) + length(q01) + length(q10))

% possibly a better way is to calculate the accuracy for each class separately
sensitivity = length(q11) / (length(q11) + length(q01)) % accuracy on class 1
specificity = length(q00) / (length(q10) + length(q00)) % accuracy on class 0

% balanced accuracy: the average of the two per-class accuracies
accuracy = (sensitivity + specificity) / 2

%***********************************************************
%***********************************************************
%***********************************************************
% INTRODUCTORY BIOINFORMATICS PROBLEMS
%***********************************************************
%***********************************************************
%***********************************************************

%***********************************************************
% 1. Sequence Alignment
%***********************************************************

% read the fasta files with 3 sequences
[s h] = readFASTA('sequences.txt');
h{1} % display header of the first protein
s{1} % display sequence of the first protein
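
% 'readFASTA' is a course helper that parses a FASTA file into two cell arrays,
% the sequences and the headers (in that order, as used above). A minimal
% sketch of such a parser, to be saved as readFASTA.m:
%
%   function [s, h] = readFASTA(fname)
%   s = {}; h = {};
%   fid = fopen(fname, 'r');
%   while ~feof(fid)
%       line = fgetl(fid);
%       if isempty(line), continue; end
%       if line(1) == '>'
%           h{end + 1} = line(2 : end);   % header line (without the '>')
%           s{end + 1} = '';
%       else
%           s{end} = [s{end} line];       % append residues to current sequence
%       end
%   end
%   fclose(fid);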

% read scoring matrix (BLOSUM62 in this case)
S = load('blosum62.txt');

% align sequences 1 and 2
[d seq1 seq2] = protglobal(s{1}, s{2}, S, -1, -11);

% display this alignment
[seq1; seq2]
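
% 'protglobal' is a course helper implementing global (Needleman-Wunsch)
% alignment; the two numeric arguments are assumed to be the gap-extension and
% gap-opening penalties. If the Bioinformatics Toolbox is available, the result
% can be roughly cross-checked with its nwalign function (which expects
% positive penalty values):
%
%   [score, alignment] = nwalign(s{1}, s{2}, 'ScoringMatrix', 'BLOSUM62', ...
%                                'GapOpen', 11, 'ExtendGap', 1);
%   alignment   % 3-row char array: sequence 1, match line, sequence 2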

% calculate and print sequence identity
sequence_identity = length(find(seq1 == seq2)) / max(length(s{1}), length(s{2}));
fprintf(1, '\nSequence identity is %.1f%%\n\n', 100 * sequence_identity);

% align sequences 1 and 3
[d seq1 seq2] = protglobal(s{1}, s{3}, S, -1, -11);

% display alignment
[seq1; seq2]

sequence_identity = length(find(seq1 == seq2)) / max(length(s{1}), length(s{3}));
fprintf(1, '\n\nSequence identity is %.1f%%\n\n', 100 * sequence_identity);
