% mass_spec_demo.m
% Two-sample t-test at each point: healthy vs. ovarian cancer groups
h = zeros(1,numPoints); p = zeros(1,numPoints); % preallocate
for count = 1:numPoints
[h(count), p(count)] = ttest2(NH_IN(count,:),OC_IN(count,:),.0001,'both','unequal');
end
% h can be used to extract the significant M/Z values
sig_Masses = NH_MZ(find(h));
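% Report how many points passed the test, as a quick sanity check:
fprintf('%d of %d points were significant at alpha = 1e-4\n', numel(sig_Masses), numPoints);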
%%
% We can plot the p-values over the spectra
figure(hFig);
plot(NH_MZ,-log(p),'g')
% Notice that there are significant regions at high m/z values, even though
% the intensity there is low.
%%
close all; clear DR
%%
% We could also use the p-value to extract significant points.
sig_Masses = NH_MZ(find(p<1e-6));
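%%
% The fixed 1e-6 cutoff is somewhat arbitrary. A false discovery rate
% correction is one alternative; a minimal sketch, assuming the mafdr
% function from the Bioinformatics Toolbox is available:
fdr = mafdr(p(:),'BHFDR',true);      % Benjamini-Hochberg adjusted p-values
sig_Masses_fdr = NH_MZ(fdr < 0.01);  % hypothetical variant of sig_Masses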
%% Classification methods
% There are many classification tools in MATLAB. The Statistics Toolbox
% includes classification trees and discriminant analysis. We will try a
% number of classification techniques.
% First we calculate some useful values.
D = [NH_IN OC_IN];
ns = size(D,1); % number of points
nC = size(OC_IN,2); % number of samples with cancer
nH = size(NH_IN,2); % number of healthy samples
tn = size(D,2); % total number of samples
% make an indicator vector, where 1s correspond to healthy samples and 2s to
% ovarian cancer samples.
id = [ones(1,nH) 2*ones(1,nC)];
%% K-Nearest Neighbor classifier
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% knnclassify is part of the Bioinformatics Toolbox; similar k-nearest
% neighbor implementations are also available from the MATLAB Central File Exchange
c = knnclassify(D(:,sel_e)',D(:,sel_t)',id(sel_t),3,'corr');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('KNN Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
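%%
% The hand-rolled split above can also be generated with crossvalind. A
% minimal sketch, assuming crossvalind from the Bioinformatics Toolbox is
% available (passing the group vector id gives a stratified hold-out):
[trainIdx,testIdx] = crossvalind('HoldOut',id,0.5);
cKnn = knnclassify(D(:,testIdx)',D(:,trainIdx)',id(trainIdx),3,'corr');
fprintf('KNN with crossvalind split: %.2f%% correct\n', mean(cKnn == id(testIdx)')*100);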
%% Linear Discriminant Analysis
% Run a PCA/LDA classifier - a simplified version of the "Q5" algorithm
% proposed by Lilien et al. in "Probabilistic Disease Classification of
% Expression-Dependent Proteomic Data from Mass Spectrometry of Human
% Serum," Journal of Computational Biology, 10(6), 2003, pp. 925-946.
%
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use linear classifier
c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('PCA/LDA Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
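%%
% classify defaults to a linear discriminant; other discriminant types are
% a one-argument change. A minimal sketch (reusing x, sel_t, and sel_e from
% the last loop iteration above; 'diagLinear' is a naive-Bayes-like variant
% that tolerates singular covariances better than 'quadratic'):
cDiag = classify(x(sel_e,:),x(sel_t,:),id(sel_t),'diagLinear');
fprintf('diagLinear discriminant: %.2f%% correct\n', mean(cDiag == id(sel_e)')*100);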
%% Variation on PCA/LDA classifier
% Instead of working with the raw intensity values, we can also run the
% PCA/LDA classifier over the gradient of the resampled data.
DI=diff(D0); % D0 is the resampled intensity matrix created earlier in this demo
for j=1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This time use a simple mean-to-std ratio based data reduction method
md = mean(DI(:,sel_t(id(sel_t)==2)),2); % mean of the cancer training samples
Q = DI - repmat(md,1,tn); % residuals after removing the cancer mean
mc = mean(Q(:,sel_t(id(sel_t)==1)),2); % residual mean of the healthy samples
sc = std(Q(:,sel_t(id(sel_t)==1)),[],2);% and also their std
[dummy,sel] = sort(-abs(mc./sc)); % rank points by |mean/std|, largest first
sel = sel(1:2000); % keep the 2000 highest-scoring points
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA/LDA classifier
P = princomp(Q(sel,sel_t)','econ');
x = Q(sel,:)' * P(:,1:nt-3);
c= classify(x(sel_e,:),x(sel_t,:),id(sel_t));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('Gradient PCA/LDA Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
%% Neural Network Classifier
% Try a simple perceptron
for j = 1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use Neural Network
% c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
idnn = double(id == 1); % perceptron targets: 1 = healthy, 0 = cancer
[norm_data,mindata,maxdata] = premnmx(x'); % normalize each feature to [-1,1]
range=[mindata,maxdata]; % per-input min/max matrix required by newp
net=newp(range,1); % a single-neuron perceptron
net.trainParam.show = Inf; % suppress the training progress display
net = train(net,norm_data(:,sel_t),idnn(sel_t));
apt = sim(net,norm_data(:,sel_e));
% [c,ba,ra] = postreg(apt,idnn(sel_e));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = (1-sum(abs(apt-idnn(sel_e)))/numel(sel_e))*100;
disp(sprintf('Perceptron Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
%%
% Now a more complex network: a feed-forward backpropagation network with
% three hidden layers (9, 5, and 5 neurons) and a two-neuron output layer.
for j = 1:3 % run random simulation a few times
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Select random training and test sets %
per_train = 0.5; % percentage of samples for training
nCt = floor(nC * per_train); % number of cancer samples in training
nHt = floor(nH * per_train); % number of healthy samples in training
nt = nCt+nHt; % total number of training samples
sel_H = randperm(nH); % randomly select samples for training
sel_C = nH + randperm(nC); % randomly select samples for training
sel_t = [sel_C(1:nCt) sel_H(1:nHt)]; % samples chosen for training
sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% select only the significant features.
ndx = find(p < 1e-6);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PCA to reduce dimensionality
P = princomp(D(ndx,sel_t)','econ');
% Project into PCA space
x = D(ndx,:)' * P(:,1:nt-2);
% Use Neural Network
[norm_data,mindata,maxdata] = premnmx(x'); % normalize each feature to [-1,1]
range=[mindata,maxdata]; % per-input min/max matrix required by newff
ffnet = newff(range,[9 5 5 2],{'tansig','tansig','tansig','tansig'},'traincgb'); % conjugate-gradient training
% set a couple of parameters
ffnet.trainParam.show = inf; % hide output
ffnet.trainParam.goal = 0.001;
ffnet.trainParam.epochs = 500;
ffnet.trainParam.min_grad = 1e-10; % can sometimes get stuck at local minima
% train the network
% the network has a two-neuron tan-sigmoid output layer, so encode the
% targets as +/-1 pairs: row 1 is +1 for healthy, row 2 is +1 for cancer
idff = [-1 + 2*(id==1); 1 - 2*(id==1)];
ffnet = train(ffnet,norm_data(:,sel_t),idff(:,sel_t));
% Test the trained network on the evaluation set
aff=sim(ffnet,norm_data(:,sel_e));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% How well did we do?
per_corr(j) = mean(sign(aff(1,:)) == idff(1,sel_e))*100; % threshold output 1 at zero
disp(sprintf('Feed-Forward Network Classifier Step %d: %.2f%% correct\n',j, per_corr(j)))
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
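%%
% Thresholding a single output row is fragile; a cleaner scoring is to pick
% the larger of the two outputs. A minimal sketch, assuming the compet and
% vec2ind functions from the Neural Network Toolbox:
cFF = vec2ind(compet(aff)); % 1 if the healthy row wins, 2 if the cancer row wins
fprintf('Feed-forward net, winner-take-all scoring: %.2f%% correct\n', mean(cFF == id(sel_e))*100);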
%% Other ideas
% An SVM classifier could be built using the Optimization Toolbox. Wavelets
% might also be good for feature extraction. To extract the feature
% information from the classifier, a bootstrap method or a random forest
% might be the way to go. Classification trees using the *treefit* function
% would be easy to implement.
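%%
% A minimal sketch of two of these ideas, assuming svmtrain/svmclassify
% from the Bioinformatics Toolbox and treefit/treeval from the Statistics
% Toolbox are available (reusing x, sel_t, sel_e, and id from above; the
% 'QP' method is the one that uses the Optimization Toolbox):
svmStruct = svmtrain(x(sel_t,:),id(sel_t)','Method','QP');
cSvm = svmclassify(svmStruct,x(sel_e,:));
fprintf('SVM: %.2f%% correct\n', mean(cSvm == id(sel_e)')*100);
% treefit builds a classification tree when the labels are strings
labels = cellstr(num2str(id'));
tr = treefit(x(sel_t,:),labels(sel_t));
cTree = treeval(tr,x(sel_e,:)); % returns class numbers: 1 = '1', 2 = '2'
fprintf('Classification tree: %.2f%% correct\n', mean(cTree == id(sel_e)')*100);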