📄 mass_spec_demo.m

📁 Recent work by Petricoin and Liotta and co-workers (Petricoin et al. Use of proteomic patterns in se
💻 M
📖 第 1 页 / 共 2 页
字号:
12 下一页
%% Mass Spectrometry Data Analysis Demonstration
%
% Robert Henson and Lucio Cetto, The MathWorks, Inc.
%
% This example demonstrates a number of ways to classify mass spectrometry
% data and shows some statistical tools that can be used to look for
% potential disease markers. The data in this example are from the FDA-NCI
% Clinical Proteomics Program Databank (http://ncifdaproteomics.com). 
%
% There are a number of datasets in the Clinical Proteomics Program
% Databank. This example will use the Ovarian Dataset 8-7-02 that was
% generated using the WCX2 protein array. The sample set includes 91
% controls and 162 ovarian cancers. The data can be downloaded from
% http://ncifdaproteomics.com/download-ovar.php .

%% Importing and visualizing some of the samples
% After downloading and uncompressing the file, the data are stored in
% comma separated value files in two directories, 'Control', and 'Ovarian
% Cancer'. Each file contains two columns, the M/Z values and the intensity
% values corresponding to these mass/charge ratios. 
% The *importdata* function can be used to read in one of the samples:

close all force;clear all;
cd Control
daf_0181 = importdata('Control daf-0181.csv')

%%
% The data values are stored in the data field of the daf_0181 structure.
% The *plot* command creates a graph of the data.

plot(daf_0181.data(:,1),daf_0181.data(:,2))

% The column headers are in the colheaders field. These can be used for the
% X and Y axis labels.
xAxisLabel = daf_0181.colheaders{1};
yAxisLabel = daf_0181.colheaders{2};
xlabel(xAxisLabel);
ylabel(yAxisLabel);

% The default X axis limits are a little loose, these can be made tighter
% using the axis XLim property.
xAxisLimits = [daf_0181.data(1,1),daf_0181.data(end,1)];
set(gca,'xlim',xAxisLimits )

%% 
% The spectra for the cancer samples are in the 'Ovarian Cancer' directory

cd ../'Ovarian Cancer'
daf_0601 = importdata('Ovarian Cancer daf-0601.csv')
hold on
plot(daf_0601.data(:,1),daf_0601.data(:,2),'r')
legend({'Control','Ovarian Cancer'});
hold off

%%
% The area around 7000 to 9500 M/Z has some interesting features. 

set(gca,'xlim',[6500,10000],'ylim',[0,50]);

% There are a couple of peaks that are more pronounced in the cancer sample
% and other peaks that are more pronounced in the cancer sample than the
% control. If you pan along the plot, you will see several such peaks.

%% Importing all the samples
% There are 254 sample files. The *csvread* function is slightly more
% efficient than importdata for reading a large number of similar files.

OC_files = dir('*.csv');

% Preallocate some space for the data.
numOC = numel(OC_files);
numValues = size(daf_0601.data,1);
OC_IN = zeros(numValues,numOC);

% The M/Z values are constant across all the samples.
OC_MZ = daf_0601.data(:,1);

% Loop over the files and read in the data.
for i = 1:numOC
    OC_IN(:,i) = csvread(OC_files(i).name,1,1);
end

%%
% Repeat this for the control data.
cd ../Control
NH_files = dir('*.csv');

% Preallocate some space for the data.
numNH = numel(NH_files);
numValues = size(daf_0181.data,1);
NH_IN = zeros(numValues,numNH);
NH_MZ = daf_0181.data(:,1);

% Loop over the files and read in the data.
for i = 1:numNH
    NH_IN(:,i) = csvread(NH_files(i).name,1,1);
end

cd ..

%% Plotting multiple spectra
% We can plot multipl spectra on the same figure
figure
hNH = plot(NH_MZ,NH_IN(:,1:5) ,'b');
hold on;
hOC = plot(OC_MZ,OC_IN(:,1:5),'r');
set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )
legend([hNH(1),hOC(1)],{'Control','Ovarian Cancer'})

%%
% Zooming in on the region from 7000 to 9500 M/Z shows some peaks that may
% be useful for classifying the data. 

set(gca,'xlim',[6500,10000],'ylim',[0,50]);

%% 
% Another way to visualize the whole data set is to look at the average
% signal for the control and cancer samples. We can plot the mean +- one
% standard deviation.

mean_NH = mean(NH_IN,2);
std_NH = std(NH_IN,0,2);
mean_OC = mean(OC_IN,2);
std_OC = std(OC_IN,0,2);

hFig = figure;
hNHm = plot(NH_MZ,mean_NH,'b');
hold on
hOCm = plot(OC_MZ,mean_OC,'r');
plot(NH_MZ,mean_NH+std_NH,'b:')
plot(NH_MZ,mean_NH-std_NH,'b:')
plot(OC_MZ,mean_OC+std_OC,'r:')
plot(OC_MZ,mean_OC-std_OC,'r:')


set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )
legend([hNHm,hOCm],{'Control','Ovarian Cancer'})

%% Baseline correction
% Notice that the intensity values are never close to zero. We can correct
% for this in a number of ways. Here we give an example of how to correct
% the baseline for the samples. We use a windowed piecewise cubic
% interpolation method. This does not use the Signal Processing Toolbox but
% there are several functions in the toolbox that give greater flexibility
% in baseline correction and resampling.

D = [NH_IN OC_IN];
ns = size(D,1);         % number of points
nC = size(OC_IN,2);    % number of samples with cancer
nH = size(NH_IN,2);    % number of healty samples
tn = size(D,2);     % total number of samples
w = 75; % window size
temp = zeros(w,ceil(ns/w))+NaN;

for i=1:tn
    temp(1:ns)=D(:,i);
    [m,h]=min(temp);
    g = h>1 & h<w;
    h=w*[0:numel(h)-1]+h;
    m = m(g);
    h = h(g);
    D0(:,i) = [temp(1:ns)-interp1(h,m,1:ns,'pchip')]';
end
figure
plot(NH_MZ,D0(:,1:50:end))
set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )

%% Resampling the data
% In this case, the data is already normalized so that all datasets have
% the same m/z values. If this was not the case, MATLAB could easily be
% used to resample the data. Here we show how to resample the data so that
% only integer m/z values are considered.

 nts = floor(max(NH_MZ));
 DR = zeros(nts,tn);
 % resample data using linear interpolation 
 for i=1:tn
     DR(:,i) = interp1(NH_MZ,D(:,i),[1:nts]','linear');
 end

%%
% To see the effect of the resampling, take a look at heatmaps of the
% original spectra and the resampled spectra. In the original data, there
% are many samples at the start.
%%
% Plot a heat map of the original data.
figure;
imagesc([NH_IN,OC_IN])
title('Original Data');
xlabel('Group');ylabel('M/Z');
% display actual m/z value instead of index 
set(gca,'yticklabel',NH_MZ(2000:2000:14000));
% set the xlabels to show the groups
set(gca,'tickdir','out')
set(gca,'xtick',[45 92 172])
set(gca,'xticklabel',{'Control','','Ovarian Cancer'})
colorbar
%%
% Plot a heat map of the resampled data.
figure
imagesc(DR);
title('Resampled Data');
xlabel('Group');ylabel('M/Z');
% set the xlabels to show the groups
set(gca,'tickdir','out')
set(gca,'xtick',[45 92 172])
set(gca,'xticklabel',{'Control','','Ovarian Cancer'})
colorbar
%% Finding significant features
% A naive approach for finding which features in the sample are significant
% is to assume that each M/Z value is independent and do a two-way t-test.

numPoints = numel(NH_MZ);
h = false(numPoints,1);
p = nan+zeros(numPoints,1);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -