📄 filemanagement.cpp
字号:
dat_files.resize(num_dat);
ms2_files.resize(num_ms2);
int mgf_c=0;
int dta_c=0;
int mzxml_c=0;
int dat_c=0;
int ms2_c=0;
for (i=0; i<list.size(); i++)
{
if (list[i][0] == '#')
continue;
int last_pos = list[i].length()-1;
if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' ||
list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
last_pos--;
if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' &&
list[i][last_pos ]=='a')
{
dta_files[dta_c].single_name = list[i];
if (quick_flag)
{
dta_files[dta_c].scan_dta(list[i],config);
}
else
dta_files[dta_c].initial_read(config);
dta_files[dta_c].type = DTA;
dta_files[dta_c].file_idx = dta_c;
if (dta_files[dta_c].org_pm_with_19 < min_spec_mass)
min_spec_mass = dta_files[dta_c].org_pm_with_19;
if (dta_files[dta_c].org_pm_with_19 > max_spec_mass)
max_spec_mass = dta_files[dta_c].org_pm_with_19;
if (dta_files[dta_c].charge < min_charge)
min_charge = dta_files[dta_c].charge;
if (dta_files[dta_c].charge > max_charge)
max_charge = dta_files[dta_c].charge;
num_spectra[dta_files[dta_c].charge]++;
dta_c++;
}
else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' &&
list[i][last_pos ]=='f')
{
mgf_files[mgf_c].mgf_name =list[i];
// cout << "DD1 reading idx " << mgf_c << endl;
int mgf_file_idx = (file_idx>=0 ? mgf_c + file_idx : mgf_c);
mgf_files[mgf_c].initial_read(config,mgf_file_idx, quick_flag);
if (mgf_files[mgf_c].min_spec_mass<min_spec_mass)
min_spec_mass = mgf_files[mgf_c].min_spec_mass;
if (mgf_files[mgf_c].max_spec_mass>max_spec_mass)
max_spec_mass = mgf_files[mgf_c].max_spec_mass;
if (mgf_files[mgf_c].min_charge < min_charge)
min_charge = mgf_files[mgf_c].min_charge;
if (mgf_files[mgf_c].max_charge > max_charge)
max_charge = mgf_files[mgf_c].max_charge;
int c;
for (c=0; c<=max_charge; c++)
num_spectra[c] += mgf_files[mgf_c].num_spectra[c];
mgf_c++;
}
else if (list[i][last_pos-4] == 'm' &&
list[i][last_pos-3] == 'z' &&
list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' &&
list[i][last_pos ]=='L')
{
mzxml_files[mzxml_c].mzxml_name =list[i];
int mzxml_file_idx = (file_idx>=0 ? mzxml_c + file_idx : mzxml_c);
mzxml_files[mzxml_c].initial_read(config,mzxml_c);
if (mzxml_files[mzxml_c].min_spec_mass<min_spec_mass)
min_spec_mass = mzxml_files[mzxml_c].min_spec_mass;
if (mzxml_files[mzxml_c].max_spec_mass>max_spec_mass)
max_spec_mass = mzxml_files[mzxml_c].max_spec_mass;
if (mzxml_files[mzxml_c].min_charge < min_charge)
min_charge = mzxml_files[mzxml_c].min_charge;
if (mzxml_files[mzxml_c].max_charge > max_charge)
max_charge = mzxml_files[mzxml_c].max_charge;
// int c;
// for (c=0; c<=max_charge; c++)
// num_spectra[c] += mzxml_files[mzxml_c].num_spectra[c];
mzxml_c++;
}
else if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' &&
list[i][last_pos ]=='t')
{
dat_files[dat_c].dat_name =list[i];
int dat_file_idx = (file_idx>=0 ? dat_c + file_idx : dat_c);
dat_files[dat_c].initial_read(config,dat_file_idx);
if (dat_files[dat_c].min_spec_mass<min_spec_mass)
min_spec_mass = dat_files[dat_c].min_spec_mass;
if (dat_files[dat_c].max_spec_mass>max_spec_mass)
max_spec_mass = dat_files[dat_c].max_spec_mass;
if (dat_files[dat_c].min_charge < min_charge)
min_charge = dat_files[dat_c].min_charge;
if (dat_files[dat_c].max_charge > max_charge)
max_charge = dat_files[dat_c].max_charge;
int c;
for (c=0; c<=max_charge; c++)
num_spectra[c] += dat_files[dat_c].num_spectra[c];
dat_c++;
}
else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' &&
list[i][last_pos ]=='2')
{
ms2_files[ms2_c].ms2_name =list[i];
ms2_files[ms2_c].initial_read(config,ms2_c, quick_flag);
if (ms2_files[ms2_c].min_spec_mass<min_spec_mass)
min_spec_mass = ms2_files[ms2_c].min_spec_mass;
if (ms2_files[ms2_c].max_spec_mass>max_spec_mass)
max_spec_mass = ms2_files[ms2_c].max_spec_mass;
if (ms2_files[ms2_c].min_charge < min_charge)
min_charge = ms2_files[ms2_c].min_charge;
if (mgf_files[ms2_c].max_charge > max_charge)
max_charge = ms2_files[ms2_c].max_charge;
int c;
for (c=0; c<=max_charge; c++)
num_spectra[c] += ms2_files[ms2_c].num_spectra[c];
ms2_c++;
}
else
{
cout << "Error :: couldn't recognize file type for: " << list[i] << endl;
exit(1);
}
}
count_num_spectra();
// print_summary_stats();
}
/*****************************************************************
Returns how many spectra are present in the list file.
Also samples m_over_z values to generate an approximate
histogram in case the set of spectra needs to be split.

config         - handed on to the per-file readers.
list_file      - file holding the spectrum file paths; empty entries
                 and entries that start with '#' are skipped.
mass_histogram - output; cleared first, then filled with the sampled
                 m_over_z values.

The file type is decided by the ending of each path (dta, mgf, mzXML,
dat, ms2). An entry with any other ending prints an error message and
terminates the program with exit(1).
******************************************************************/
int FileManager::count_num_spectra(Config *config, const char* list_file,
                                   vector<mass_t>& mass_histogram) const
{
    vector<string> list;
    read_paths_into_list(list_file,list);

    int num_spectra_read=0;
    vector<mass_t> masses;  // m_over_z values gathered since the last sampling round
    int set_size = 1000;    // number of spectra that are examined for mass collection before flushing
    const size_t max_histogram_size = 100000; // histogram is thinned once it reaches this size

    mass_histogram.clear();

    for (size_t i=0; i<list.size(); i++)
    {
        string& path = list[i];
        if (path.empty() || path[0] == '#')
            continue;

        // Length of the path once a single trailing control character is ignored.
        size_t len = path.length();
        const char tail = path[len-1];
        if (tail == '\n' || tail == '\r' || tail == '\t' || tail == '\f')
            len--;

        // Every suffix test checks the length first, so an entry shorter than
        // the extension is reported as unrecognized instead of being indexed
        // at a negative position.
        if (len>=3 && path.compare(len-3,3,"dta")==0)
        {
            DTA_file dta_file;
            // Scan the file before reading m_over_z; a freshly constructed
            // DTA_file has not been given the path yet.
            dta_file.scan_dta(path,config);
            masses.push_back(dta_file.m_over_z);
            num_spectra_read++;
        }
        else if (len>=3 && path.compare(len-3,3,"mgf")==0)
        {
            MGF_file mgf_file;
            mgf_file.mgf_name =path;
            mgf_file.initial_read(config,0,true);
            for (size_t j=0; j<mgf_file.single_spectra.size(); j++)
                masses.push_back(mgf_file.single_spectra[j].m_over_z);
            num_spectra_read+=static_cast<int>(mgf_file.single_spectra.size());
        }
        else if (len>=5 && path.compare(len-5,5,"mzXML")==0)
        {
            MZXML_file mzxml_file;
            mzxml_file.mzxml_name =path;
            mzxml_file.initial_read(config,0);
            for (size_t j=0; j<mzxml_file.single_spectra.size(); j++)
                masses.push_back(mzxml_file.single_spectra[j].m_over_z);
            num_spectra_read+=static_cast<int>(mzxml_file.single_spectra.size());
        }
        else if (len>=3 && path.compare(len-3,3,"dat")==0)
        {
            DAT_file dat_file;
            dat_file.dat_name =path;
            dat_file.initial_read(config,0);
            for (size_t j=0; j<dat_file.single_spectra.size(); j++)
                masses.push_back(dat_file.single_spectra[j].m_over_z);
            num_spectra_read+=static_cast<int>(dat_file.single_spectra.size());
        }
        else if (len>=3 && path.compare(len-3,3,"ms2")==0)
        {
            MS2_file ms2_file;
            ms2_file.ms2_name =path;
            ms2_file.initial_read(config,0);
            for (size_t j=0; j<ms2_file.single_spectra.size(); j++)
                masses.push_back(ms2_file.single_spectra[j].m_over_z);
            num_spectra_read+=static_cast<int>(ms2_file.single_spectra.size());
        }
        else
        {
            cout << "Error: couldn't recognize file type for: " << path << endl;
            exit(1);
        }

        // if enough spectra were read, sample their masses and clear buffer
        if (masses.size()>static_cast<size_t>(set_size))
        {
            // Sample 10 masses per set_size collected spectra. The ratio is
            // computed in floating point so that the +0.5 really rounds.
            const int num_sample =
                static_cast<int>((masses.size()/static_cast<double>(set_size))*10 + 0.5);
            for (int j=0; j<num_sample; j++)
            {
                size_t idx = static_cast<size_t>(my_random() * masses.size());
                if (idx >= masses.size())   // in case my_random() can return exactly 1.0
                    idx = masses.size()-1;
                mass_histogram.push_back(masses[idx]);
            }
            masses.clear();
        }

        // Once the histogram is large, drop roughly half of its entries at
        // random and halve the sampling rate from here on. The histogram grows
        // several entries at a time, hence ">=" rather than "==".
        if (mass_histogram.size() >= max_histogram_size)
        {
            vector<mass_t> thinned;
            for (size_t j=0; j<mass_histogram.size(); j++)
                if (my_random()>=0.5)
                    thinned.push_back(mass_histogram[j]);
            mass_histogram.swap(thinned);
            set_size *= 2;
        }
    }

    // NOTE: masses gathered after the last sampling round (at most set_size
    // of them) are not added to the histogram.
    return num_spectra_read;
}
// counts how many total spectra are available from each charge
void FileManager::count_num_spectra()
{
num_spectra.clear();
num_spectra.resize(20,0);
min_charge =99;
max_charge =-1;
int i;
for (i=0; i<dta_files.size(); i++)
{
int charge =dta_files[i].charge;
num_spectra[charge]++;
if (charge<min_charge)
min_charge = charge;
if (charge>max_charge)
max_charge = charge;
}
for (i=0; i<mgf_files.size(); i++)
{
int j;
for (j=0; j<mgf_files[i].single_spectra.size(); j++)
{
int charge =mgf_files[i].single_spectra[j].charge;
num_spectra[charge]++;
if (charge<min_charge)
min_charge = charge;
if (charge>max_charge)
max_charge = charge;
}
}
for (i=0; i<mzxml_files.size(); i++)
{
int j;
for (j=0; j<mzxml_files[i].single_spectra.size(); j++)
{
int charge =mzxml_files[i].single_spectra[j].charge;
num_spectra[charge]++;
if (charge<min_charge)
min_charge = charge;
if (charge>max_charge)
max_charge = charge;
}
}
for (i=0; i<dat_files.size(); i++)
{
int j;
for (j=0; j<dat_files[i].single_spectra.size(); j++)
{
int charge =dat_files[i].single_spectra[j].charge;
num_spectra[charge]++;
if (charge<min_charge)
min_charge = charge;
if (charge>max_charge)
max_charge = charge;
}
}
total_num_spectra=0;
for (i=0; i<num_spectra.size(); i++)
total_num_spectra += num_spectra[i];
}
// Inits the FileManager using mass levels (for very large
// collections of spectra). This initialization uses a quick scan
void FileManager::init_from_list_file(Config *config, const char* list_file,
mass_t min_m_over_z, mass_t max_m_over_z)
{
int i;
vector<string> list;
read_paths_into_list(list_file,list);
num_spectra.resize(20,0);
for (i=0; i<list.size(); i++)
{
if (list[i][0] == '#')
continue;
int last_pos = list[i].length()-1;
if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' ||
list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
last_pos--;
if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' &&
list[i][last_pos ]=='a')
{
DTA_file dta_file;
dta_file.scan_dta(list[i],config);
if (dta_file.m_over_z>=min_m_over_z && dta_file.m_over_z<max_m_over_z)
dta_files.push_back(dta_file);
}
else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' &&
list[i][last_pos ]=='f')
{
MGF_file mgf_file;
mgf_file.mgf_name =list[i];
mgf_file.initial_read(config,mgf_files.size(),true);
// change the single spectrum pointers in the mgf file record
// to include only those that have a mass that is in the permitted range
vector<MGF_single> good_singles;
int j;
for (j=0; j<mgf_file.single_spectra.size(); j++)
{
if (mgf_file.single_spectra[j].m_over_z >= min_m_over_z &&
mgf_file.single_spectra[j].m_over_z < max_m_over_z)
{
good_singles.push_back(mgf_file.single_spectra[j]);
}
}
mgf_file.single_spectra = good_singles;
mgf_files.push_back(mgf_file);
}
else if (list[i][last_pos-4] == 'm' &&
list[i][last_pos-3] == 'z' &&
list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' &&
list[i][last_pos ]=='L')
{
MZXML_file mzxml;
mzxml.mzxml_name =list[i];
mzxml.initial_read(config,mzxml_files.size());
// change the single spectrum pointers in the mgf file record
// to include only those that have a mass that is in the permitted range
vector<MZXML_single> good_singles;
int j;
for (j=0; j<mzxml.single_spectra.size(); j++)
{
if (mzxml.single_spectra[j].m_over_z >= min_m_over_z &&
mzxml.single_spectra[j].m_over_z < max_m_over_z)
{
good_singles.push_back(mzxml.single_spectra[j]);
}
}
mzxml.single_spectra = good_singles;
mzxml_files.push_back(mzxml);
}
else if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' &&
list[i][last_pos ]=='t')
{
DAT_file dat;
dat.dat_name =list[i];
dat.initial_read(config,dat_files.size());
// change the single spectrum pointers in the mgf file record
// to include only those that have a mass that is in the permitted range
vector<DAT_single> good_singles;
int j;
for (j=0; j<dat.single_spectra.size(); j++)
{
if (dat.single_spectra[j].m_over_z >= min_m_over_z &&
dat.single_spectra[j].m_over_z < max_m_over_z)
{
good_singles.push_back(dat.single_spectra[j]);
}
}
dat.single_spectra = good_singles;
dat_files.push_back(dat);
}
else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' &&
list[i][last_pos ]=='2')
{
MS2_file ms2_file;
ms2_file.ms2_name =list[i];
ms2_file.initial_read(config,ms2_files.size(),true);
// change the single spectrum pointers in the mgf file record
// to include only those that have a mass that is in the permitted range
vector<MS2_single> good_singles;
int j;
for (j=0; j<ms2_file.single_spectra.size(); j++)
{
if (ms2_file.single_spectra[j].m_over_z >= min_m_over_z &&
ms2_file.single_spectra[j].m_over_z < max_m_over_z)
{
good_singles.push_back(ms2_file.single_spectra[j]);
}
}
ms2_file.single_spectra = good_singles;
ms2_files.push_back(ms2_file);
}
else
{
cout << "Error: couldn't recognize file type for:: " << list[i] << endl;
exit(1);
}
}
count_num_spectra();
}
// Inits the FileManager using mass levels (for very large
// collections of spectra). This initialization uses a quick scan
// Only for files whose idx is true in file indicators
void FileManager::init_from_list_file(Config *config, const char* list_file,
const vector<bool>& file_indicators)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -