mass_spec_demo.m
set(gca,'xticklabel',{'Control','','Ovarian Cancer'})
colorbar

%% Finding significant features
% A naive approach for finding which features in the sample are significant
% is to assume that each M/Z value is independent and do a two-sample
% t-test.
numPoints = numel(NH_MZ);
h = false(numPoints,1);
p = nan(numPoints,1);
for count = 1:numPoints
    [h(count) p(count)] = ttest2(NH_IN(count,:),OC_IN(count,:),0.0001,'both','unequal');
end
% h can be used to extract the significant M/Z values
sig_Masses = NH_MZ(find(h));
%%
% We can plot the p-values over the spectra
figure(hFig);
plot(NH_MZ,-log(p),'g')
% Notice that there are significant regions at high M/Z values but low
% intensity.
%%
close all; clear DR
%%
% We could also use the p-values directly to extract significant points.
sig_Masses = NH_MZ(find(p<1e-6));

%% Classification methods
% There are many classification tools in MATLAB. The Statistics Toolbox
% includes classification trees and discriminant analysis. We will try a
% number of classification techniques.
% First we calculate some useful values.
D = [NH_IN OC_IN];
ns = size(D,1);     % number of points
nC = size(OC_IN,2); % number of samples with cancer
nH = size(NH_IN,2); % number of healthy samples
tn = size(D,2);     % total number of samples
% Make an indicator vector, where 1s correspond to healthy samples and 2s
% to ovarian cancer samples.
id = [ones(1,nH) 2*ones(1,nC)];

%% K-Nearest Neighbor classifier
for j = 1:3 % run random simulation a few times
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Select random training and test sets
    per_train = 0.5;             % percentage of samples for training
    nCt = floor(nC * per_train); % number of cancer samples in training
    nHt = floor(nH * per_train); % number of healthy samples in training
    nt = nCt+nHt;                % total number of training samples
    sel_H = randperm(nH);        % randomly order the healthy samples
    sel_C = nH + randperm(nC);   % randomly order the cancer samples
    sel_t = [sel_C(1:nCt) sel_H(1:nHt)];         % samples chosen for training
    sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Classify with a K-nearest neighbor classifier (knnclassify is
    % available from the MATLAB Central File Exchange)
    c = knnclassify(D(:,sel_e)',D(:,sel_t)',id(sel_t),3,'corr');
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % How well did we do?
    per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
    disp(sprintf('KNN Classifier Step %d: %.2f%% correct\n',j,per_corr(j)))
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end

%% Linear Discriminant Analysis
% Run a PCA/LDA classifier - a simplified version of the "Q5" algorithm
% proposed by R. Lilien and H. Farid et al. in "Probabilistic Disease
% Classification of Expression-Dependent Proteomic Data from Mass
% Spectrometry of Human Serum," Journal of Computational Biology,
% 10(6) 2003, pp. 925-946.
for j = 1:3 % run random simulation a few times
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Select random training and test sets
    per_train = 0.5;             % percentage of samples for training
    nCt = floor(nC * per_train); % number of cancer samples in training
    nHt = floor(nH * per_train); % number of healthy samples in training
    nt = nCt+nHt;                % total number of training samples
    sel_H = randperm(nH);        % randomly order the healthy samples
    sel_C = nH + randperm(nC);   % randomly order the cancer samples
    sel_t = [sel_C(1:nCt) sel_H(1:nHt)];         % samples chosen for training
    sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % select only the significant features
    ndx = find(p < 1e-6);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % PCA to reduce dimensionality
    P = princomp(D(ndx,sel_t)','econ');
    % Project into PCA space
    x = D(ndx,:)' * P(:,1:nt-2);
    % Use a linear classifier
    c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % How well did we do?
    per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
    disp(sprintf('PCA/LDA Classifier Step %d: %.2f%% correct\n',j,per_corr(j)))
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end

%% Variation on PCA/LDA classifier
% Instead of working with the raw intensity values, we can also run the
% PCA/LDA classifier over the gradient of the resampled data.
DI = diff(D0); % first difference (gradient) of the resampled data
for j = 1:3 % run random simulation a few times
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Select random training and test sets
    per_train = 0.5;             % percentage of samples for training
    nCt = floor(nC * per_train); % number of cancer samples in training
    nHt = floor(nH * per_train); % number of healthy samples in training
    nt = nCt+nHt;                % total number of training samples
    sel_H = randperm(nH);        % randomly order the healthy samples
    sel_C = nH + randperm(nC);   % randomly order the cancer samples
    sel_t = [sel_C(1:nCt) sel_H(1:nHt)];         % samples chosen for training
    sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % This time use a different data reduction method: rank the features
    % by the ratio of mean to standard deviation of the residuals
    md = mean(DI(:,sel_t(id(sel_t)==2)),2);  % mean of the cancer samples
    Q = DI - repmat(md,1,tn);                % residuals
    mc = mean(Q(:,sel_t(id(sel_t)==1)),2);   % residual mean of the healthy samples
    sc = std(Q(:,sel_t(id(sel_t)==1)),[],2); % and also the std
    [dump,sel] = sort(-abs(mc./sc));         % rank the features by |mean/std|
    sel = sel(1:2000);                       % keep the 2000 highest-ranked features
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % PCA/LDA classifier
    P = princomp(Q(sel,sel_t)','econ');
    x = Q(sel,:)' * P(:,1:nt-3);
    c = classify(x(sel_e,:),x(sel_t,:),id(sel_t));
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % How well did we do?
    per_corr(j) = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
    disp(sprintf('Gradient PCA/LDA Classifier Step %d: %.2f%% correct\n',j,per_corr(j)))
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end

%% Neural Network Classifier
% Try a simple perceptron first.
for j = 1:3 % run random simulation a few times
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Select random training and test sets
    per_train = 0.5;             % percentage of samples for training
    nCt = floor(nC * per_train); % number of cancer samples in training
    nHt = floor(nH * per_train); % number of healthy samples in training
    nt = nCt+nHt;                % total number of training samples
    sel_H = randperm(nH);        % randomly order the healthy samples
    sel_C = nH + randperm(nC);   % randomly order the cancer samples
    sel_t = [sel_C(1:nCt) sel_H(1:nHt)];         % samples chosen for training
    sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % select only the significant features
    ndx = find(p < 1e-6);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % PCA to reduce dimensionality
    P = princomp(D(ndx,sel_t)','econ');
    % Project into PCA space
    x = D(ndx,:)' * P(:,1:nt-2);
    % Use a neural network: a single perceptron, with the inputs
    % normalized to [-1 1] by premnmx
    idnn = double(id == 1); % target: 1 for healthy samples, 0 for cancer
    [norm_data,mindata,maxdata] = premnmx(x');
    range = [mindata,maxdata];
    net = newp(range,1);
    net.trainParam.show = Inf; % hide the training output
    net = train(net,norm_data(:,sel_t),idnn(sel_t));
    apt = sim(net,norm_data(:,sel_e));
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % How well did we do?
    per_corr(j) = (1-sum(abs(apt-idnn(sel_e)))/numel(sel_e))*100;
    disp(sprintf('Perceptron Classifier Step %d: %.2f%% correct\n',j,per_corr(j)))
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end
%%
% Now a more complex network: a feed-forward back-propagation network with
% three hidden layers.
for j = 1:3 % run random simulation a few times
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Select random training and test sets
    per_train = 0.5;             % percentage of samples for training
    nCt = floor(nC * per_train); % number of cancer samples in training
    nHt = floor(nH * per_train); % number of healthy samples in training
    nt = nCt+nHt;                % total number of training samples
    sel_H = randperm(nH);        % randomly order the healthy samples
    sel_C = nH + randperm(nC);   % randomly order the cancer samples
    sel_t = [sel_C(1:nCt) sel_H(1:nHt)];         % samples chosen for training
    sel_e = [sel_C(nCt+1:end) sel_H(nHt+1:end)]; % samples for evaluation
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % select only the significant features
    ndx = find(p < 1e-6);
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % PCA to reduce dimensionality
    P = princomp(D(ndx,sel_t)','econ');
    % Project into PCA space
    x = D(ndx,:)' * P(:,1:nt-2);
    % Use a feed-forward neural network
    [norm_data,mindata,maxdata] = premnmx(x');
    range = [mindata,maxdata];
    ffnet = newff(range,[9 5 5 2],{'tansig','tansig','tansig','tansig'},'traincgb');
    % set a couple of parameters
    ffnet.trainParam.show = inf;       % hide output
    ffnet.trainParam.goal = 0.001;
    ffnet.trainParam.epochs = 500;
    ffnet.trainParam.min_grad = 1e-10; % can sometimes get stuck at local minima
    % Train the network. The output layer uses two tan-sigmoidal transfer
    % functions, so the targets are +/-1: the first row is +1 for healthy
    % samples and the second row is +1 for cancer samples.
    idff = [-1 + 2*(id==1); 1 - 2*(id==1)];
    ffnet = train(ffnet,norm_data(:,sel_t),idff(:,sel_t));
    % Test and analyze the classifier
    aff = sim(ffnet,norm_data(:,sel_e));
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % How well did we do? Use the sign of the first output as the
    % predicted class.
    per_corr(j) = (1-sum(sign(aff(1,:))~=idff(1,sel_e))/numel(sel_e))*100;
    disp(sprintf('Feed-Forward Network Classifier Step %d: %.2f%% correct\n',j,per_corr(j)))
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
end

%% Other ideas
% An SVM classifier could be implemented using the Optimization Toolbox.
% Wavelets might also be good for feature extraction. To extract the
% feature information from the classifier, a bootstrap method or a random
% forest might be the way to go. Classification trees using the *treefit*
% function would be easy to implement, as sketched below.
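%%
% A minimal sketch of the classification-tree suggestion, using the
% treefit/treeval functions mentioned above (superseded by fitctree and
% predict in newer MATLAB releases). It assumes D, p, id, sel_t and sel_e
% are still in the workspace from one of the runs above, and that the
% class numbers returned by treeval coincide with the 1/2 labels in id.
ndx = find(p < 1e-6); % reuse the significant features
% grow a classification tree on the training samples
tr = treefit(D(ndx,sel_t)',id(sel_t)','method','classification');
% predict class numbers for the evaluation samples
c = treeval(tr,D(ndx,sel_e)');
% How well did we do?
per_corr_tree = (1-sum(abs(c-id(sel_e)'))/numel(sel_e))*100;
disp(sprintf('Classification Tree: %.2f%% correct\n',per_corr_tree))
% treedisp(tr) would draw the tree, showing which features were selected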