📄 qcbasicspecreader.cpp
字号:
#include "QuickClustering.h"
#include "Isotopes.h"
int BasicSpecReader::read_basic_spec(Config* config,
const FileManager& fm,
SingleSpectrumFile *ssf,
QCPeak* peaks,
bool override_file_idx,
bool no_processing)
{
const mass_t tolerance = config->get_tolerance();
int i,num_peaks = -1;
if (ssf->num_peaks > peak_list.size())
{
max_peak_list_size = ssf->num_peaks * 2;
if (max_peak_list_size<2000)
max_peak_list_size = 2000;
peak_list.resize(max_peak_list_size);
}
// cout << ssf->file_idx << " " << ssf->single_name << " NUM PEAKS: " << ssf->num_peaks << endl;
if (ssf->type == DTA)
{
num_peaks = get_peak_list_from_DTA(ssf->single_name.c_str());
if (num_peaks != ssf->num_peaks)
{
cout << "Heh heh ... there is an error!!!" << endl;
exit(1);
}
}
else if (ssf->type == MGF)
{
if (this->current_mgf_file_idx != ssf->file_idx)
{
if (mgf_stream)
{
fclose(mgf_stream);
}
int file_idx = ssf->file_idx;
if (override_file_idx)
file_idx = 0;
const string& fname = fm.get_mgf_file(file_idx).mgf_name.c_str();
mgf_stream=fopen(fname.c_str(),"r");
if (! mgf_stream)
{
cout << "Error: couldn't open mgf: " << fname << endl;
exit(1);
}
this->current_mgf_file_idx = ssf->file_idx;
}
if ( fseek(mgf_stream,ssf->file_pos,SEEK_SET) )
{
cout << "Error: could not skip in file!" << endl;
exit(1);
}
num_peaks = get_peak_list_from_MGF(mgf_stream);
MGF_single *mgf_ssf = (MGF_single *)ssf;
if (num_peaks>0 && peak_list[0].mass != mgf_ssf->first_peak_mass)
{
mgf_ssf->print_ssf_stats(config);
cout << "Error: mismatch in first peak masses: " << setprecision(5) << mgf_ssf->first_peak_mass << " " << peak_list[0].mass << endl;
cout << "This error could arise because of Unix/Windows issues." << endl;
cout << "Try running unix2dos on the input files." << endl;
exit(1);
}
}
else if (ssf->type == MZXML)
{
// overide if the mzXML was read entirely into the MZXML_file
// via peak buff. Should know if this happend because
MZXML_single *mzxml_ssf = (MZXML_single *)ssf;
if (mzxml_ssf->peak_buff_start_idx>=0)
{
int i;
float *ptr;
fm.copy_mzxml_peak_buff_ptr(&ptr);
float *mzxml_peak_buff = ptr + mzxml_ssf->peak_buff_start_idx;
int idx=0;
for (i=0; i<mzxml_ssf->num_peaks; i++)
{
peaks[i].mass =mzxml_peak_buff[idx++];
peaks[i].intensity=mzxml_peak_buff[idx++];
}
// peaks already filtered
return mzxml_ssf->num_peaks;
}
else
{
if (this->current_mzxml_file_idx != ssf->file_idx)
{
if (mzxml_stream)
{
fclose(mzxml_stream);
}
int file_idx = ssf->file_idx;
if (override_file_idx)
file_idx = 0;
const string& fname = fm.get_mzxml_file(file_idx).mzxml_name.c_str();
mzxml_stream=fopen(fname.c_str(),"r");
if (! mzxml_stream)
{
cout << "Error: couldn't open mzxml: " << fname << endl;
exit(1);
}
this->current_mzxml_file_idx = ssf->file_idx;
}
if ( fseek(mzxml_stream,ssf->file_pos,0) )
{
cout << "Error: could not skip in file!" << endl;
exit(1);
}
num_peaks = get_peak_list_from_MZXML(mzxml_stream);
}
}
else if (ssf->type == DAT)
{
if (this->current_dat_file_idx != ssf->file_idx)
{
if (dat_file.is_open())
{
dat_file.close();
}
int file_idx = ssf->file_idx;
if (override_file_idx)
file_idx = 0;
const string& fname = fm.get_dat_file(file_idx).dat_name.c_str();
dat_file.open(fname.c_str(), ios::in | ios::binary);
if (! dat_file.is_open())
{
cout << "Error: couldn't open dat: " << fname << endl;
exit(1);
}
this->current_dat_file_idx = ssf->file_idx;
}
dat_file.seekg(ssf->file_pos);
// If the peaks are read from a DAT file, there is no need to perform a joining of the peaks
// num_peaks = get_peak_list_from_DAT(dat_file, &peak_list[0]);
num_peaks = get_peak_list_from_DAT(dat_file, peaks);
if (num_peaks<3)
num_peaks=-1;
return num_peaks;
}
else if (ssf->type == PKL)
{
int file_idx = ssf->file_idx;
if (override_file_idx)
file_idx = 0;
const string& pkl_name = fm.get_pkl_dir(file_idx).dir_path;
string file_path = pkl_name + "/" + ssf->single_name;
num_peaks = get_peak_list_from_PKL(file_path);
}
else if (ssf->type == MS2)
{
if (this->current_ms2_file_idx != ssf->file_idx)
{
if (ms2_stream)
{
fclose(ms2_stream);
}
const string& fname = fm.get_ms2_file(ssf->file_idx).ms2_name.c_str();
ms2_stream=fopen(fname.c_str(),"rb");
if (! mgf_stream)
{
cout << "Error: couldn't open ms2: " << fname << endl;
exit(1);
}
this->current_ms2_file_idx = ssf->file_idx;
}
if ( fseek(ms2_stream,ssf->file_pos,SEEK_SET) )
{
cout << "Error: could not skip in file!" << endl;
exit(1);
}
num_peaks = get_peak_list_from_MS2(ms2_stream);
}
else
{
cout << "Error: invalid file type:" << ssf->type << endl;
exit(1);
}
if (num_peaks<3)
num_peaks=-1;
// copy peaks as is
if (no_processing)
{
int i;
for (i=0; i<num_peaks; i++)
peaks[i]=peak_list[i];
return num_peaks;
}
// join adjacent peaks
const mass_t join_tolerance = (tolerance < 0.05 ? tolerance : 0.6 * tolerance);
int p_idx=0;
i=1;
while (i<num_peaks)
{
if (peak_list[i].mass - peak_list[p_idx].mass<=join_tolerance)
{
intensity_t inten_sum = peak_list[i].intensity + peak_list[p_idx].intensity;
mass_t new_mass = (peak_list[i].intensity * peak_list[i].mass +
peak_list[p_idx].intensity * peak_list[p_idx].mass ) / inten_sum;
peak_list[p_idx].mass = new_mass;
peak_list[p_idx].intensity = inten_sum;
}
else
{
peak_list[++p_idx]=peak_list[i];
}
i++;
}
num_peaks = p_idx+1;
// filter low intensity noise
// and mark those that are good peaks
int max_peak_idx = num_peaks -1;
int min_idx=1;
int max_idx=1;
p_idx =1;
vector<bool> indicators;
mark_top_peaks_with_sliding_window(&peak_list[0],
num_peaks,
config->get_local_window_size(),
config->get_max_number_peaks_per_local_window(),
indicators);
if (ssf->precursor_intensity <=0)
{
// cout << "PRecursor was zero!" << endl;
// exit(1);
ssf->precursor_intensity=0;
for (i=0; i<num_peaks; i++)
ssf->precursor_intensity+=peak_list[i].intensity;
}
p_idx=0;
for (i=0; i<num_peaks; i++)
if (indicators[i])
peaks[p_idx++]=peak_list[i];
num_peaks = p_idx;
// normalize intensities
if (config->get_need_to_normalize() && ! config->get_itraq_mode())
{
intensity_t total_inten=0;
for (i=0; i<num_peaks; i++)
total_inten+=peaks[i].intensity;
const mass_t one_over_total_inten = (1000.0 / total_inten);
for (i=0; i<num_peaks; i++)
peaks[i].intensity *= one_over_total_inten;
}
// sanity check
if (peaks[0].mass<=0 && peaks[0].intensity<0)
{
cout << "Invalid mass or intensity at first peak: " << peaks[0].mass << "\t" << peaks[0].intensity << endl;
exit(1);
}
for (i=1; i<num_peaks; i++)
{
if (peaks[i].intensity<0 || peaks[i].mass <0 || peaks[i].mass < peaks[i-1].mass)
{
cout << "Error: peak mismatches in file! (peak " << i << "/" << num_peaks-1 << ")" << endl;
cout << fixed << setprecision(4);
int j;
for (j=0; j<num_peaks; j++)
cout << peaks[j].mass << " " << peaks[j].intensity << endl;
exit(1);
}
}
return num_peaks;
}
// these functions just extract the peak list from the spectrum file, return the actual
// number of peaks (after joining)
int BasicSpecReader::get_peak_list_from_DTA(const char* dta_name)
{
ifstream fs(dta_name,ios::in);
if (! fs)
return -1;
char buff[256];
fs.getline(buff,256);
if (fs.bad())
return false;
while (buff[0] =='#')
{
fs.getline(buff,256);
}
int p_count=0;
while (fs.getline(buff,256))
{
istringstream is(buff);
mass_t& mass = peak_list[p_count].mass;
intensity_t& intensity = peak_list[p_count].intensity;
is >> mass >> intensity;
if (mass <0 || intensity==0) // the peak probably got rounded down
continue;
p_count++;
}
fs.close();
return p_count;
}
int BasicSpecReader::get_peak_list_from_MGF(FILE *mgf_stream)
{
char buff[1024];
mass_t peak_mass=-1;
intensity_t peak_intensity=-1;
while (1)
{
if( ! fgets(buff, 1024, mgf_stream) )
return -1;
if (! strncmp("END IONS",buff,7))
return -1;
istringstream is(buff);
is >> peak_mass >> peak_intensity;
if (peak_mass>0 && peak_intensity>0)
break;
}
peak_list[0].mass = peak_mass;
peak_list[0].intensity = peak_intensity;
int p_count = 1;
while (fgets(buff, 256, mgf_stream))
{
istringstream is(buff);
if (! strncmp("END IONS",buff,7))
break;
mass_t mass=-1;
intensity_t intensity=-1;
is >> mass >> intensity;
if (mass <=0 || intensity<=0) // the peak probably got rounded down
continue;
peak_list[p_count].mass = mass;
peak_list[p_count].intensity= intensity;
p_count++;
}
return p_count;
}
//
int BasicSpecReader::get_peak_list_from_DAT(ifstream& dat_file, QCPeak *peaks)
{
const int num_header_bytes_to_read = sizeof(mass_t) + 2* sizeof(float) + 4 * sizeof(int);
const int num_peaks_pos = sizeof(mass_t) + 3 *sizeof(int);
static char header_buff[num_header_bytes_to_read];
static vector<float> peak_buff;
static int peak_buff_size = 0;
memset(header_buff,0,num_header_bytes_to_read);
// read header info
if (! dat_file.read(header_buff,num_header_bytes_to_read))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -