📄 mass_spec_demo.m
字号:
%% Mass Spectrometry Data Analysis Demonstration
%
% Robert Henson and Lucio Cetto, The MathWorks, Inc.
%
% This example demonstrates a number of ways to classify mass spectrometry
% data and shows some statistical tools that can be used to look for
% potential disease markers. The data in this example are from the FDA-NCI
% Clinical Proteomics Program Databank (http://ncifdaproteomics.com).
%
% There are a number of datasets in the Clinical Proteomics Program
% Databank. This example will use the Ovarian Dataset 8-7-02 that was
% generated using the WCX2 protein array. The sample set includes 91
% controls and 162 ovarian cancers. The data can be downloaded from
% http://ncifdaproteomics.com/download-ovar.php .
%% Importing and visualizing some of the samples
% After downloading and uncompressing the file, the data are stored in
% comma separated value files in two directories, 'Control', and 'Ovarian
% Cancer'. Each file contains two columns, the M/Z values and the intensity
% values corresponding to these mass/charge ratios.
% The *importdata* function can be used to read in one of the samples:
close all force;clear all;
cd Control
daf_0181 = importdata('Control daf-0181.csv')
%%
% The data values are stored in the data field of the daf_0181 structure.
% The *plot* command creates a graph of the data.
plot(daf_0181.data(:,1),daf_0181.data(:,2))
% The column headers are in the colheaders field. These can be used for the
% X and Y axis labels.
xAxisLabel = daf_0181.colheaders{1};
yAxisLabel = daf_0181.colheaders{2};
xlabel(xAxisLabel);
ylabel(yAxisLabel);
% The default X axis limits are a little loose, these can be made tighter
% using the axis XLim property.
xAxisLimits = [daf_0181.data(1,1),daf_0181.data(end,1)];
set(gca,'xlim',xAxisLimits )
%%
% The spectra for the cancer samples are in the 'Ovarian Cancer' directory
cd ../'Ovarian Cancer'
daf_0601 = importdata('Ovarian Cancer daf-0601.csv')
hold on
plot(daf_0601.data(:,1),daf_0601.data(:,2),'r')
legend({'Control','Ovarian Cancer'});
hold off
%%
% The area around 7000 to 9500 M/Z has some interesting features.
set(gca,'xlim',[6500,10000],'ylim',[0,50]);
% There are a couple of peaks that are more pronounced in the cancer sample
% and other peaks that are more pronounced in the cancer sample than the
% control. If you pan along the plot, you will see several such peaks.
%% Importing all the samples
% There are 254 sample files. The *csvread* function is slightly more
% efficient than importdata for reading a large number of similar files.
OC_files = dir('*.csv');
% Preallocate some space for the data.
numOC = numel(OC_files);
numValues = size(daf_0601.data,1);
OC_IN = zeros(numValues,numOC);
% The M/Z values are constant across all the samples.
OC_MZ = daf_0601.data(:,1);
% Loop over the files and read in the data.
for i = 1:numOC
OC_IN(:,i) = csvread(OC_files(i).name,1,1);
end
%%
% Repeat this for the control data.
cd ../Control
NH_files = dir('*.csv');
% Preallocate some space for the data.
numNH = numel(NH_files);
numValues = size(daf_0181.data,1);
NH_IN = zeros(numValues,numNH);
NH_MZ = daf_0181.data(:,1);
% Loop over the files and read in the data.
for i = 1:numNH
NH_IN(:,i) = csvread(NH_files(i).name,1,1);
end
cd ..
%% Plotting multiple spectra
% We can plot multipl spectra on the same figure
figure
hNH = plot(NH_MZ,NH_IN(:,1:5) ,'b');
hold on;
hOC = plot(OC_MZ,OC_IN(:,1:5),'r');
set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )
legend([hNH(1),hOC(1)],{'Control','Ovarian Cancer'})
%%
% Zooming in on the region from 7000 to 9500 M/Z shows some peaks that may
% be useful for classifying the data.
set(gca,'xlim',[6500,10000],'ylim',[0,50]);
%%
% Another way to visualize the whole data set is to look at the average
% signal for the control and cancer samples. We can plot the mean +- one
% standard deviation.
mean_NH = mean(NH_IN,2);
std_NH = std(NH_IN,0,2);
mean_OC = mean(OC_IN,2);
std_OC = std(OC_IN,0,2);
hFig = figure;
hNHm = plot(NH_MZ,mean_NH,'b');
hold on
hOCm = plot(OC_MZ,mean_OC,'r');
plot(NH_MZ,mean_NH+std_NH,'b:')
plot(NH_MZ,mean_NH-std_NH,'b:')
plot(OC_MZ,mean_OC+std_OC,'r:')
plot(OC_MZ,mean_OC-std_OC,'r:')
set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )
legend([hNHm,hOCm],{'Control','Ovarian Cancer'})
%% Baseline correction
% Notice that the intensity values are never close to zero. We can correct
% for this in a number of ways. Here we give an example of how to correct
% the baseline for the samples. We use a windowed piecewise cubic
% interpolation method. This does not use the Signal Processing Toolbox but
% there are several functions in the toolbox that give greater flexibility
% in baseline correction and resampling.
D = [NH_IN OC_IN];
ns = size(D,1); % number of points
nC = size(OC_IN,2); % number of samples with cancer
nH = size(NH_IN,2); % number of healty samples
tn = size(D,2); % total number of samples
w = 75; % window size
temp = zeros(w,ceil(ns/w))+NaN;
for i=1:tn
temp(1:ns)=D(:,i);
[m,h]=min(temp);
g = h>1 & h<w;
h=w*[0:numel(h)-1]+h;
m = m(g);
h = h(g);
D0(:,i) = [temp(1:ns)-interp1(h,m,1:ns,'pchip')]';
end
figure
plot(NH_MZ,D0(:,1:50:end))
set(gca,'xlim',[daf_0181.data(1,1),daf_0181.data(end,1)])
xlabel(xAxisLabel);
ylabel(yAxisLabel);
set(gca,'xlim',xAxisLimits )
%% Resampling the data
% In this case, the data is already normalized so that all datasets have
% the same m/z values. If this was not the case, MATLAB could easily be
% used to resample the data. Here we show how to resample the data so that
% only integer m/z values are considered.
nts = floor(max(NH_MZ));
DR = zeros(nts,tn);
% resample data using linear interpolation
for i=1:tn
DR(:,i) = interp1(NH_MZ,D(:,i),[1:nts]','linear');
end
%%
% To see the effect of the resampling, take a look at heatmaps of the
% original spectra and the resampled spectra. In the original data, there
% are many samples at the start.
%%
% Plot a heat map of the original data.
figure;
imagesc([NH_IN,OC_IN])
title('Original Data');
xlabel('Group');ylabel('M/Z');
% display actual m/z value instead of index
set(gca,'yticklabel',NH_MZ(2000:2000:14000));
% set the xlabels to show the groups
set(gca,'tickdir','out')
set(gca,'xtick',[45 92 172])
set(gca,'xticklabel',{'Control','','Ovarian Cancer'})
colorbar
%%
% Plot a heat map of the resampled data.
figure
imagesc(DR);
title('Resampled Data');
xlabel('Group');ylabel('M/Z');
% set the xlabels to show the groups
set(gca,'tickdir','out')
set(gca,'xtick',[45 92 172])
set(gca,'xticklabel',{'Control','','Ovarian Cancer'})
colorbar
%% Finding significant features
% A naive approach for finding which features in the sample are significant
% is to assume that each M/Z value is independent and do a two-way t-test.
numPoints = numel(NH_MZ);
h = false(numPoints,1);
p = nan+zeros(numPoints,1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -