% matlab_tutorial.m
A % display the matrix A
pos = find(A(:, 2) > 5)
a_found = A(pos, :) % print these rows
% another way to do the same thing
a_found1 = A(find(A(:, 2) > 5), :)
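% a third equivalent way: logical indexing, which skips the explicit find()
a_found2 = A(A(:, 2) > 5, :)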
% generate a matrix of random numbers (uniform distribution)
B = rand(5000, 5);
hist(B(:, 1), 20); % histogram of column 1 in 20 bins
% generate a matrix of random numbers (normal distribution)
B = randn(5000, 5);
hist(B(:, 5), 30); % histogram of column 5 in 30 bins
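% rand draws from (0, 1) and randn from the standard normal; both are easily
% rescaled, e.g. uniform numbers on (2, 5) or normal numbers with mean 10 and
% standard deviation 3 (illustration only, not used later in the tutorial):
r_uniform = 2 + 3 * rand(5000, 1);
hist(r_uniform, 20);
r_normal = 10 + 3 * randn(5000, 1);
hist(r_normal, 30);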
%***********************************************************
%***********************************************************
%***********************************************************
% DATA MINING PROBLEMS
%***********************************************************
%***********************************************************
%***********************************************************
%***********************************************************
% 1. Preliminaries: Loading Data and Analysis
%***********************************************************
% remove all variables from the workspace (similar to "clear all")
clear
% load the data (13 features, 1 target variable)
D = load('housing.dat');
% get the number of data points (examples) and features
[num_examples, num_cols] = size(D)
num_features = num_cols - 1;
% look at the histogram of the first feature and target variable
hist(D(:, 1), 15); % numerical feature
hist(D(:, 4), 15); % binary feature (categorical, in general)
hist(D(:, 14), 15); % target
% calculate the correlation coefficient between columns 4 and 7
corrcoef(D(:, [4 7]))
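% optional: rank all features by the absolute value of their correlation with
% the target in the last column (a small illustration using corrcoef on the
% whole matrix; nothing below depends on these variables)
C_all = corrcoef(D);
[corr_sorted, feat_order] = sort(abs(C_all(1 : num_features, num_cols)), 'descend');
[feat_order corr_sorted]  % most correlated features first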
% visualize linear correlation between 3 features and a target
plotmatrix(D(:, [4 6 13 14]))
% make a box plot for columns 3 and 6 in the same figure
boxplot(D(:, [3 6]))
%***********************************************************
% 2. Classification Problem: Logistic Regression
%***********************************************************
clear
% get the dataset
D = load('pima.txt');
[pima_row, pima_col] = size(D)
num_features = pima_col - 1
% before prediction, target classes need to be 0-1 (not 1-2 as in the dataset)
pos = find(D(:, size(D, 2)) == 2);
D(pos, size(D, 2)) = 0;
% now, use 30% for the test set
[train, test] = divideset(D, 30);
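% divideset is a helper function supplied with this tutorial (not a built-in).
% The assumed behaviour is a random row split, with the given percentage going
% to the second output; a rough equivalent using only built-ins (illustration
% only -- the split produced by divideset above is the one actually used):
shuffle = randperm(size(D, 1));
n_test_rows = round(0.3 * size(D, 1));
test_sketch = D(shuffle(1 : n_test_rows), :);
train_sketch = D(shuffle(n_test_rows + 1 : end), :);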
% normalize training set
[meanv, stdv, train(:, 1 : num_features)] = normalize(train(:, 1 : num_features), [], []);
% normalize test set (notice that 'meanv' and 'stdv' from the training are used to normalize)
[meanv, stdv, test(:, 1 : num_features)] = normalize(test(:, 1 : num_features), meanv, stdv);
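% normalize is also a tutorial helper; the assumed behaviour is plain z-scoring:
% subtract the column means and divide by the column standard deviations
% (computed from the data itself when the last two arguments are empty,
% otherwise the supplied meanv/stdv are reused). A quick self-contained sketch:
X_sketch = rand(10, 3);                                            % toy data
mean_sketch = mean(X_sketch);
std_sketch = std(X_sketch);
Z_sketch = (X_sketch - repmat(mean_sketch, size(X_sketch, 1), 1)) ./ ...
           repmat(std_sketch, size(X_sketch, 1), 1);
mean(Z_sketch)                                                     % approximately 0
std(Z_sketch)                                                      % approximately 1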
% add a column of ones into both training and test sets
train = [train(:, 1 : num_features) ones(size(train, 1), 1) train(:, num_features + 1)];
test = [test(:, 1 : num_features) ones(size(test, 1), 1) test(:, num_features + 1)];
% train logistic regression model
LR_model = logit(train(:, size(train, 2)), train(:, 1 : num_features + 1));
% calculate prediction on the test data
prediction = test(:, 1 : num_features + 1) * LR_model.beta;
% transform results to the 0-1 interval (the "logsig" function can also be used)
prediction = 1 ./ (1 + exp(-prediction));
% evaluate accuracy of prediction
q00 = find(prediction < 0.5 & test(:, num_features + 2) == 0); % predicted class 0, true class 0
q01 = find(prediction < 0.5 & test(:, num_features + 2) == 1); % predicted class 0, true class 1
q10 = find(prediction >= 0.5 & test(:, num_features + 2) == 0); % predicted class 1, true class 0
q11 = find(prediction >= 0.5 & test(:, num_features + 2) == 1); % predicted class 1, true class 1
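% the four index sets above form a 2-by-2 confusion matrix
% (rows = true class, columns = predicted class)
conf_mat = [length(q00) length(q10); length(q01) length(q11)]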
% calculate accuracy of prediction
accuracy = (length(q00) + length(q11)) / (length(q00) + length(q11) + length(q01) + length(q10))
% a possibly better approach is to calculate the accuracy for each class separately
sensitivity = length(q11) / (length(q11) + length(q01)) % accuracy on class 1
specificity = length(q00) / (length(q10) + length(q00)) % accuracy on class 0
% get accuracy of a balanced set
accuracy = (sensitivity + specificity) / 2
% Question: if the accuracy is calculated by the last formula, is there a better way to build a model?
% Consider changing numbers of examples from each class.
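% One way to explore the question above is to balance the training set before
% fitting, e.g. by randomly undersampling the larger class. A sketch only (the
% balanced set is built here but the model above is not refitted; labels are
% assumed to be 0/1 as arranged earlier, with the target in the last column):
idx0 = find(train(:, size(train, 2)) == 0);
idx1 = find(train(:, size(train, 2)) == 1);
n_min = min(length(idx0), length(idx1));
perm0 = randperm(length(idx0));
perm1 = randperm(length(idx1));
keep0 = idx0(perm0(1 : n_min));
keep1 = idx1(perm1(1 : n_min));
train_bal = train([keep0(:); keep1(:)], :);  % equal numbers of class-0 and class-1 examples
% train_bal could now be passed to logit() in place of 'train'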
%***********************************************************
% 3. Regression Problem - predicting real-numbered outputs
%***********************************************************
clear
clc
D = load('housing.dat');
num_features = size(D, 2) - 1;
% split the data into training, validation and test subsets
% leave 30% of the data in the test set
% (the subsets are named train_set/val_set/test_set so that the name 'train'
% stays free for the Neural Network Toolbox train() function used below)
[tr_val, test_set] = divideset(D, 30);
% assign 30% of tr_val to the validation set and 70% to the training set
[train_set, val_set] = divideset(tr_val, 30);
% normalize all the features and the target to mean zero and variance 1
[meanv, stdv, train_set] = normalize(train_set, [], []);
[meanv, stdv, val_set] = normalize(val_set, meanv, stdv);
[meanv, stdv, test_set] = normalize(test_set, meanv, stdv);
% initialize neural network
num_hidd = 3; % choose the number of hidden neurons
% we define a neural network with one hidden layer, sigmoid activation functions in the hidden layer,
% and linear activation in the output layer. The training algorithm is selected by 'method' below
% (method = 2 picks resilient backpropagation)
method = 2;
switch method
case 1, train_alg='traingd' % Standard Back Propagation
case 2, train_alg='trainrp' % Resilient - Back propagation
case 3, train_alg='traingdx' % Variable Learning Rate
case 4, train_alg='traincgb' % Powell-Beale Conjugate Gradient
case 5, train_alg='trainlm' % Levenberg-Marquardt
case 6, train_alg='trainbfg' % Quasi-Newton Algorithm
case 7, train_alg='trainscg' % Scaled Conjugate Gradient
otherwise, train_alg='traingd'
end
net = newff([min(train_set(:, 1 : num_features)); max(train_set(:, 1 : num_features))]', ...
[num_hidd 1], {'logsig', 'purelin'}, train_alg);
net
% define other important parameters for neural network training
% if you omit the learning rate, MATLAB sets it automatically,
% which is often the better choice
net.trainParam.lr = 0.1; % choose learning rate
net.trainParam.epochs = 250; % maximum number of epochs
net.trainParam.show = 10; % show results every 10 epochs
net.trainParam.max_fail = 5; % stop if the error on validation set does not improve over 5 iterations
% reshape the training, validation and test sets to fit the neural network training function
P = train_set(:, 1 : num_features)';
T = train_set(:, num_features + 1)';
VV.P = val_set(:, 1 : num_features)';
VV.T = val_set(:, num_features + 1)';
TV.P = test_set(:, 1 : num_features)';
TV.T = test_set(:, num_features + 1)';
% train the neural network (the second output 'tr' is the training record)
[net, tr] = train(net, P, T, [], [], VV, TV);
% check the predictions on the test set
prediction = sim(net, test_set(:, 1 : num_features)')';
% calculate the difference between the output and the target variable (column 14)
pred_error = prediction - test_set(:, 14);
% plot the histogram of errors (in 20 bins)
hist(pred_error, 20);
% calculate mean square error (MSE)
mse = (pred_error' * pred_error) / length(pred_error)
% calculate R-square value (are we better than a trivial predictor?)
R_square = 1 - mse / std(test_set(:, 14)) ^ 2
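% For intuition: the "trivial predictor" behind R-square always predicts the
% test-set mean of the target. A quick check -- the value below should be close
% to R_square above (std uses the n-1 normalization, hence a small difference):
trivial_error = test_set(:, 14) - mean(test_set(:, 14));
trivial_mse = (trivial_error' * trivial_error) / length(trivial_error);
R_square_check = 1 - mse / trivial_mse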
%***********************************************************
% 4. Regression Problem - using PCA
%***********************************************************
% now, try the same problem with PCA-transformed features
new_dim = 5; % number of new transformed features
% the function pca_an performs the PCA transformation
[pca_data, proj, retvar] = pca_an(D, new_dim, 1);
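% pca_an is a helper supplied with the tutorial. A hedged sketch of the standard
% PCA computation it presumably performs: center the 13 features, take the
% leading new_dim principal directions (columns of V from the SVD) and project
% the data onto them. Illustration only -- the code below keeps using the
% pca_data matrix returned above.
X_feat = D(:, 1 : num_features);                      % features only, target excluded
X_cent = X_feat - repmat(mean(X_feat), size(X_feat, 1), 1);
[U_s, S_s, V_s] = svd(X_cent, 0);                     % economy-size SVD
scores_sketch = X_cent * V_s(:, 1 : new_dim);         % projection onto the top new_dim components
var_explained = cumsum(diag(S_s) .^ 2) / sum(diag(S_s) .^ 2);
var_explained(new_dim)                                % fraction of variance kept by new_dim components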
% total number of attributes in the new matrix pca_data
pca_col = new_dim + 1;
% split the data into training, validation and test subsets (same as before)
[pca_tr_val, pca_test] = divideset(pca_data, 30);
[pca_train, pca_val] = divideset(pca_tr_val, 30);
% normalize all the features and the target to mean zero and variance 1
[meanv, stdv, pca_train] = normalize(pca_train, [], []);
[meanv, stdv, pca_val] = normalize(pca_val, meanv, stdv);
[meanv, stdv, pca_test] = normalize(pca_test, meanv, stdv);
% set neural network parameters
num_hidd = 5;
pca_net = newff([min(pca_train(:, 1 : new_dim)); max(pca_train(:, 1 : new_dim))]', ...
[num_hidd 1], {'logsig', 'purelin'}, train_alg);
pca_net
% set parameters, just as above
pca_net.trainParam.lr = 0.1; % choose learning rate
pca_net.trainParam.epochs = 250; % maximum number of epochs
pca_net.trainParam.show = 10; % show results every 10 epochs
pca_net.trainParam.max_fail = 5; % stop if the error on validation set does not improve over 5 iterations
% reshape training, validation and test set to fit neural network training function
P = pca_train(:, 1 : new_dim)';
T = pca_train(:, new_dim + 1)';
VV.P = pca_val(:, 1 : new_dim)';
VV.T = pca_val(:, new_dim + 1)';
TV.P = pca_test(:, 1 : new_dim)';
TV.T = pca_test(:, new_dim + 1)';
% train the network (the second output is the training record)
[pca_net, pca_tr] = train(pca_net, P, T, [], [], VV, TV);
% check predictions on the test set
pca_prediction = sim(pca_net, pca_test(:, 1 : new_dim)')';
% calculate the prediction error and plot its histogram (20 bins)
pca_error = pca_prediction - pca_test(:, pca_col);
hist(pca_error, 20);
% calculate MSE
pca_mse = (pca_error' * pca_error) / length(pca_error)
% calculate R-square value
pca_R_square = 1 - pca_mse / std(pca_test(:, pca_col)) ^ 2
%***********************************************************
% 5. Classification Problem: Neural Networks
%***********************************************************
clear
clc
D = load('pima.txt');
[pima_row, pima_col] = size(D);
num_features = pima_col - 1;
n_class = 2;
% split the data into training, validation and test subsets
% leave 30% of data in test set
[pima_tr_val, pima_test] = divideset(D, 30);
% assign 30% of tr_val to validation and 70% to training set
[pima_train, pima_val] = divideset(pima_tr_val, 30);
% normalize the features to mean zero and variance 1
% (the 1-2 class labels in the last column are left untouched)
[meanv, stdv, pima_train(:, 1 : num_features)] = normalize(pima_train(:, 1 : num_features), [], []);
[meanv, stdv, pima_val(:, 1 : num_features)] = normalize(pima_val(:, 1 : num_features), meanv, stdv);
[meanv, stdv, pima_test(:, 1 : num_features)] = normalize(pima_test(:, 1 : num_features), meanv, stdv);
% initialize neural network
num_hidd = 5; % choose the number of hidden neurons (only 1 hidden layer)
method = 5;
switch method
case 1, train_alg = 'traingd' % Standard Back Propagation
case 2, train_alg = 'trainrp' % Resilient - Back propagation
case 3, train_alg = 'traingdx' % Variable Learning Rate
case 4, train_alg = 'traincgb' % Powell-Beale Conjugate Gradient
case 5, train_alg = 'trainlm' % Levenberg-Marquardt
case 6, train_alg = 'trainbfg' % Quasi-Newton Algorithm
case 7, train_alg = 'trainscg' % Scaled Conjugate Gradient
otherwise, train_alg = 'traingd'
end
% observe that we have n_class output nodes now, one for each class
pima_net = newff([min(pima_train(:, 1 : num_features)); max(pima_train(:, 1 : num_features))]',...
[num_hidd n_class],{'tansig', 'logsig'}, train_alg);
pima_net
% define other important parameters for neural network training
pima_net.trainParam.lr = 0.1; % choose learning rate
pima_net.trainParam.epochs = 250; % maximum number of epochs
pima_net.trainParam.show = 10; % show results every 10 epochs
pima_net.trainParam.max_fail = 5; % stop if the error on validation set does not improve over 5 iterations
% 'a' - the value used to represent the target class in the output encoding.
% Everything above 0.9 is considered class 1.
a = 0.9;
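% class_matrix_gen1 is a tutorial helper. The assumed behaviour (before the
% transpose applied below) is to expand the 1/2 class labels into an
% n_class-column target matrix that holds 'a' in the column of the true class
% and 1 - a elsewhere. A sketch of that assumed encoding:
labels_sketch = pima_train(:, pima_col);
T_sketch = (1 - a) * ones(length(labels_sketch), n_class);
T_sketch(sub2ind(size(T_sketch), (1 : length(labels_sketch))', labels_sketch)) = a;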
% reshape training, validation and test set to fit neural network training function
P = pima_train(:, 1 : num_features)';
T = class_matrix_gen1(pima_train(:, pima_col), a, n_class)';
VV.P = pima_val(:, 1 : num_features)';
VV.T = class_matrix_gen1(pima_val(:, pima_col), a, n_class)';
TV.P = pima_test(:, 1 : num_features)';
TV.T = class_matrix_gen1(pima_test(:, pima_col), a, n_class)';
% train the network (TV is passed as well so the test-set error is tracked during training)
[pima_net, pima_tr] = train(pima_net, P, T, [], [], VV, TV);
% check predictions
pima_pred = sim(pima_net, pima_test(:, 1 : num_features)');
% evaluate accuracy of prediction
q00 = find(pima_pred(1, :) >= pima_pred(2, :) & pima_test(:, num_features + 1)' == 1); % predicted class 0, true class 0
q01 = find(pima_pred(1, :) >= pima_pred(2, :) & pima_test(:, num_features + 1)' == 2); % predicted class 0, true class 1
q10 = find(pima_pred(1, :) < pima_pred(2, :) & pima_test(:, num_features + 1)' == 1); % predicted class 1, true class 0
q11 = find(pima_pred(1, :) < pima_pred(2, :) & pima_test(:, num_features + 1)' == 2); % predicted class 1, true class 1
% calculate accuracy of prediction
accuracy = (length(q00) + length(q11)) / (length(q00) + length(q11) + length(q01) + length(q10))
% a possibly better approach is to calculate the accuracy for each class separately
sensitivity = length(q11) / (length(q11) + length(q01)) % accuracy on class 1
specificity = length(q00) / (length(q10) + length(q00)) % accuracy on class 0
% get accuracy of a balanced set
accuracy = (sensitivity + specificity) / 2
%***********************************************************
%***********************************************************
%***********************************************************
% INTRODUCTORY BIOINFORMATICS PROBLEMS
%***********************************************************
%***********************************************************
%***********************************************************
%***********************************************************
% 1. Sequence Alignment
%***********************************************************
% read the FASTA file with 3 sequences
[s h] = readFASTA('sequences.txt');
h{1} % display header of the first protein
s{1} % display sequence of the first protein
% read scoring matrix (BLOSUM62 in this case)
S = load('blosum62.txt');
% align sequences 1 and 2
[d seq1 seq2] = protglobal(s{1}, s{2}, S, -1, -11);
% display this alignment
[seq1; seq2]
% calculate and print sequence identity
sequence_identity = length(find(seq1 == seq2)) / max(length(s{1}), length(s{2}));
fprintf(1, '\nSequence identity is %.1f%%\n\n', 100 * sequence_identity);
% align sequences 1 and 3
[d seq1 seq2] = protglobal(s{1}, s{3}, S, -1, -11);
% display alignment
[seq1; seq2]
sequence_identity = length(find(seq1 == seq2)) / max(length(s{1}), length(s{3}));
fprintf(1, '\n\nSequence identity is %.1f%%\n\n', 100 * sequence_identity);
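% For reference, a minimal sketch of the global (Needleman-Wunsch) alignment
% recurrence that protglobal presumably implements. This simplified version
% scores +1 for a match, -1 for a mismatch and uses a linear gap penalty of -2
% (protglobal itself uses the BLOSUM62 matrix and, judging from its arguments,
% gap penalties of -11 for opening and -1 for extending a gap); it returns only
% the alignment score, not the alignment itself.
x = s{1}; y = s{2};
match = 1; mismatch = -1; gap = -2;
F = zeros(length(x) + 1, length(y) + 1);
F(:, 1) = gap * (0 : length(x))';            % prefix of x aligned against gaps
F(1, :) = gap * (0 : length(y));             % prefix of y aligned against gaps
for i = 2 : length(x) + 1
    for j = 2 : length(y) + 1
        if x(i - 1) == y(j - 1), sub = match; else sub = mismatch; end
        F(i, j) = max([F(i - 1, j - 1) + sub, F(i - 1, j) + gap, F(i, j - 1) + gap]);
    end
end
nw_score = F(end, end)                       % global alignment score of sequences 1 and 2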