% mass_spec_demo.m
% Two-sample t-test at each point: healthy vs. ovarian cancer groups
h = zeros(1,numPoints); p = zeros(1,numPoints); % preallocate
for count = 1:numPoints
[h(count), p(count)] = ttest2(NH_IN(count,:),OC_IN(count,:),.0001,'both','unequal');
end
% h can be used to extract the significant M/Z values
sig_Masses = NH_MZ(find(h));
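% Report how many points passed the test, as a quick sanity check:
fprintf('%d of %d points were significant at alpha = 1e-4\n', numel(sig_Masses), numPoints);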
%%
% We can plot the p-values over the spectra
figure(hFig);
plot(NH_MZ,-log(p),'g')
% Notice that there are significant regions at high m/z values, even though
% the intensity there is low.
%%
close all; clear DR
%%
% We could also use the p-value to extract significant points.
sig_Masses = NH_MZ(find(p<1e-6));
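%%
% The fixed 1e-6 cutoff is somewhat arbitrary. A false discovery rate
% correction is one alternative; a minimal sketch, assuming the mafdr
% function from the Bioinformatics Toolbox is available:
fdr = mafdr(p(:),'BHFDR',true);      % Benjamini-Hochberg adjusted p-values
sig_Masses_fdr = NH_MZ(fdr < 0.01);  % hypothetical variant of sig_Masses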
%% Classification methods
% There are many classification tools in MATLAB. The Statistics Toolbox
% includes classification trees and discriminant analysis. We will try a
% number of classification techniques.
% First we calculate some useful values.
D = [NH_IN OC_IN];
ns = size(D,1); % number of points
nC = size(OC_IN,2); % number of samples with cancer
nH = size(NH_IN,2); % number of healthy samples
tn = size(D,2); % total number of samples
% make an indicator vector, where 1s correspond to healthy samples and 2s to
% ovarian cancer samples.
id = [ones(1,nH) 2*ones(1,nC)];
%% K-Nearest Neighbor classifier
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% knnclassify is part of the Bioinformatics Toolbox; similar k-nearest
% neighbor implementations are also available from the MATLAB Central File Exchange
c = knnclassify(D(:,sel_e)',D(:,sel_t)',id(sel_t),3,'corr');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('KNN Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
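%%
% The hand-rolled split above can also be generated with crossvalind. A
% minimal sketch, assuming crossvalind from the Bioinformatics Toolbox is
% available (passing the group vector id gives a stratified hold-out):
[trainIdx,testIdx] = crossvalind('HoldOut',id,0.5);
cKnn = knnclassify(D(:,testIdx)',D(:,trainIdx)',id(trainIdx),3,'corr');
fprintf('KNN with crossvalind split: %.2f%% correct\n', mean(cKnn == id(testIdx)')*100);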
%% Linear Discriminant Analysis
% Run a PCA/LDA classifier - a simplified version of the "Q5" algorithm
% proposed by Lilien et al. in "Probabilistic Disease Classification of
% Expression-Dependent Proteomic Data from Mass Spectrometry of Human
% Serum," Journal of Computational Biology, 10(6), 2003, pp. 925-946.
%
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use linear classifier
c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('PCA/LDA Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
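%%
% classify defaults to a linear discriminant; other discriminant types are
% a one-argument change. A minimal sketch (reusing x, sel_t, and sel_e from
% the last loop iteration above; 'diagLinear' is a naive-Bayes-like variant
% that tolerates singular covariances better than 'quadratic'):
cDiag = classify(x(sel_e,:),x(sel_t,:),id(sel_t),'diagLinear');
fprintf('diagLinear discriminant: %.2f%% correct\n', mean(cDiag == id(sel_e)')*100);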
%% Variation on PCA/LDA classifier
% Instead of working with the raw intensity values, we can also run the
% PCA/LDA classifier over the gradient of the resampled data.
DI=diff(D0); % D0 is the resampled intensity matrix created earlier in this demo
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This time use a simple mean-to-std ratio based data reduction method
md = mean(DI(:,sel_t(id(sel_t)==2)),2); % mean of the cancer training samples
Q = DI - repmat(md,1,tn); % residuals after removing the cancer mean
mc = mean(Q(:,sel_t(id(sel_t)==1)),2); % residual mean of the healthy samples
sc = std(Q(:,sel_t(id(sel_t)==1)),[],2);% and also their std
[dummy,sel] = sort(-abs(mc./sc)); % rank points by |mean/std|, largest first
sel = sel(1:2000); % keep the 2000 highest-scoring points
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA/LDA classifier
P = princomp(Q(sel,sel_t)','econ');
x = Q(sel,:)' * P(:,1:nt-3);
c= classify(x(sel_e,:),x(sel_t,:),id(sel_t));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('Gradient PCA/LDA Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
%% Neural Network Classifier
% Try a simple perceptron
for j = 1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use Neural Network
% c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
idnn = double(id == 1); % perceptron targets: 1 = healthy, 0 = cancer
[norm_data,mindata,maxdata] = premnmx(x'); % normalize each feature to [-1,1]
range=[mindata,maxdata]; % per-input min/max matrix required by newp
net=newp(range,1); % a single-neuron perceptron
net.trainParam.show = Inf; % suppress the training progress display
net = train(net,norm_data(:,sel_t),idnn(sel_t));
apt = sim(net,norm_data(:,sel_e));
% [c,ba,ra] = postreg(apt,idnn(sel_e));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(apt-idnn(sel_e)))/numel(sel_e))*100;
disp(sprintf('Perceptron Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
%%
% Now a more complex network: a feed-forward backpropagation network with
% three hidden layers (9, 5, and 5 neurons) and a two-neuron output layer.
for j = 1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use Neural Network
[norm_data,mindata,maxdata] = premnmx(x'); % normalize each feature to [-1,1]
range=[mindata,maxdata]; % per-input min/max matrix required by newff
ffnet = newff(range,[9 5 5 2],{'tansig','tansig','tansig','tansig'},'traincgb'); % conjugate-gradient training
% set a couple of parameters
ffnet.trainParam.show = inf; % hide output
ffnet.trainParam.goal = 0.001;
ffnet.trainParam.epochs = 500;
ffnet.trainParam.min_grad = 1e-10; % can sometimes get stuck at local minima
% train the network
% the network has a two-neuron tan-sigmoid output layer, so encode the
% targets as +/-1 pairs: row 1 is +1 for healthy, row 2 is +1 for cancer
idff = [-1 + 2*(id==1); 1 - 2*(id==1)];
ffnet = train(ffnet,norm_data(:,sel_t),idff(:,sel_t));
% Test the trained network on the evaluation set
aff=sim(ffnet,norm_data(:,sel_e));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = mean(sign(aff(1,:)) == idff(1,sel_e))*100; % threshold output 1 at zero
disp(sprintf('Feed-Forward Network Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
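%%
% Thresholding a single output row is fragile; a cleaner scoring is to pick
% the larger of the two outputs. A minimal sketch, assuming the compet and
% vec2ind functions from the Neural Network Toolbox:
cFF = vec2ind(compet(aff)); % 1 if the healthy row wins, 2 if the cancer row wins
fprintf('Feed-forward net, winner-take-all scoring: %.2f%% correct\n', mean(cFF == id(sel_e))*100);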
%% Other ideas
% An SVM classifier could be built using the Optimization Toolbox. Wavelets
% might also be good for feature extraction. To extract the feature
% information from the classifier, a bootstrap method or a random forest
% might be the way to go. Classification trees using the *treefit* function
% would be easy to implement.
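%%
% A minimal sketch of two of these ideas, assuming svmtrain/svmclassify
% from the Bioinformatics Toolbox and treefit/treeval from the Statistics
% Toolbox are available (reusing x, sel_t, sel_e, and id from above; the
% 'QP' method is the one that uses the Optimization Toolbox):
svmStruct = svmtrain(x(sel_t,:),id(sel_t)','Method','QP');
cSvm = svmclassify(svmStruct,x(sel_e,:));
fprintf('SVM: %.2f%% correct\n', mean(cSvm == id(sel_e)')*100);
% treefit builds a classification tree when the labels are strings
labels = cellstr(num2str(id'));
tr = treefit(x(sel_t,:),labels(sel_t));
cTree = treeval(tr,x(sel_e,:)); % returns class numbers: 1 = '1', 2 = '2'
fprintf('Classification tree: %.2f%% correct\n', mean(cTree == id(sel_e)')*100);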