📄 filemanagement.cpp
字号:
{
if (mgf_name.length() == 0)
{
printf("Error: must first copy name to MGF_file!\n");
exit(1);
}
FILE *stream=fopen(mgf_name.c_str(),"r");
if (! stream)
{
cout << "Error: couldn't open mgf file for reading: " << mgf_name << endl;
exit(1);
}
// cout << "TTT Reading mgf idx: " << file_idx << endl;
this->single_spectra.clear();
this->single_spectra.reserve(5000);
this->num_spectra.resize(100,0);
Spectrum s;
int counter=0;
while (1)
{
MGF_single mg;
long pos = ftell(stream); // remember stream pointer
mg.type = MGF;
mg.file_pos = -1;
mg.file_idx = file_idx;
mg.idx_in_file = counter++;
if (quick_flag)
{
if (! mg.scan_mgf_single(stream,config) )
{
static char tmp_buff[256];
if (! fgets(tmp_buff,256,stream)) // move forwards, there was a problem with that position
break; // reached eof
continue;
}
// there was something wrong with this spectrum
if (mg.org_pm_with_19<0)
continue;
if (mg.org_pm_with_19 < min_spec_mass)
min_spec_mass = mg.org_pm_with_19;
if (mg.org_pm_with_19 > max_spec_mass)
max_spec_mass = mg.org_pm_with_19;
if (mg.charge <min_charge)
min_charge = mg.charge;
if (mg.charge >max_charge)
max_charge = mg.charge;
num_spectra[mg.charge]++;
}
else
{
mg.file_pos = ftell(stream);
if (! s.read_from_MGF_stream(config,stream) )
break;
s.init_spectrum();
mg.peptide = s.get_peptide();
mg.single_name = s.get_file_name();
mg.org_pm_with_19 = s.get_org_pm_with_19();
mg.charge = s.get_charge();
mg.pm_with_19 = (s.get_corrected_pm_with_19()>0 ?s.get_corrected_pm_with_19() : s.get_org_pm_with_19());
mg.num_peaks = s.get_num_peaks();
if (mg.org_pm_with_19 < min_spec_mass)
min_spec_mass = mg.org_pm_with_19;
if (mg.org_pm_with_19 > max_spec_mass)
max_spec_mass = mg.org_pm_with_19;
if (mg.charge <min_charge)
min_charge = mg.charge;
if (mg.charge >max_charge)
max_charge = mg.charge;
num_spectra[mg.charge]++;
}
if (mg.file_pos>=0)
{
single_spectra.push_back(mg);
}
}
fclose(stream);
}
void DAT_file::initial_read(Config *config, int file_idx)
{
const int bytes_to_read = sizeof(mass_t) + 4 * sizeof(int) + 2 *sizeof(float);
if (dat_name.length() == 0)
{
printf("Error: must first copy name to DAT_file!\n");
exit(1);
}
ifstream dat_stream(dat_name.c_str(),ios::in | ios::binary);
if (! dat_stream.is_open() || ! dat_stream.good())
{
cout << "Error: couldn't open dat file for reading: " << dat_name << endl;
exit(1);
}
this->single_spectra.clear();
this->single_spectra.reserve(20000);
this->num_spectra.resize(100,0);
int counter=0;
while (1)
{
char header_buff[bytes_to_read];
long pos = dat_stream.tellg();
DAT_single dat;
dat.type = DAT;
dat.file_pos = pos;
dat.file_idx = file_idx;
dat_stream.read(header_buff,bytes_to_read);
if (dat_stream.gcount() != bytes_to_read)
break;
char *b_pos = header_buff;
dat.m_over_z = *(mass_t *)b_pos;
b_pos += sizeof(mass_t);
dat.charge = *(int *)b_pos;
b_pos += sizeof(int);
dat.mzxml_file_idx = *(int *)b_pos;
b_pos += sizeof(int);
dat.scan_number = *(int *)b_pos;
b_pos += sizeof(int);
dat.num_peaks = *(int *)b_pos;
b_pos += sizeof(int);
dat.retention_time = *(float *)b_pos;
b_pos += sizeof(float);
dat.precursor_intensity = *(float *)b_pos;
b_pos += sizeof(float);
// sanity checks
if (dat.m_over_z<0.0 || dat.m_over_z>10000.0 ||
dat.mzxml_file_idx<0 || dat.mzxml_file_idx>100000 ||
dat.num_peaks<0 || dat.num_peaks>100000 ||
dat.scan_number<0 || dat.scan_number>1000000)
{
cout << "Error in DAT file " << file_idx << " pos " << pos << " #" <<
single_spectra.size() << endl;
cout << "m/z = " << dat.m_over_z << endl;
cout << "mzxml idx = " << dat.mzxml_file_idx << endl;
cout << "scan = " << dat.scan_number << endl;
cout << "num_peaks = " << dat.num_peaks << endl;
exit(1);
}
dat_stream.seekg(pos + bytes_to_read + 2*sizeof(float)*dat.num_peaks);
single_spectra.push_back(dat);
counter++;
}
dat_stream.close();
}
void MS2_file::initial_read(Config *config, int file_idx, bool quick_flag)
{
if (ms2_name.length() == 0)
{
printf("Error: must first copy name to MS2_file!\n");
exit(1);
}
FILE *stream=fopen(ms2_name.c_str(),"r");
if (! stream)
{
cout << "Error: couldn't open mgf file for reading: " << ms2_name << endl;
exit(1);
}
this->single_spectra.clear();
this->single_spectra.reserve(5000);
this->num_spectra.resize(100,0);
Spectrum s;
int counter=0;
while (1)
{
MS2_single ms2;
long pos = ftell(stream); // remember stream pointer
ms2.type = MS2;
ms2.file_pos = pos;
ms2.file_idx = file_idx;
ms2.idx_in_file = counter++;
if (quick_flag)
{
if (! ms2.scan_ms2_single(stream,config) )
break;
if (ms2.org_pm_with_19 < min_spec_mass)
min_spec_mass = ms2.org_pm_with_19;
if (ms2.org_pm_with_19 > max_spec_mass)
max_spec_mass = ms2.org_pm_with_19;
if (ms2.charge <min_charge)
min_charge = ms2.charge;
if (ms2.charge >max_charge)
max_charge = ms2.charge;
num_spectra[ms2.charge]++;
}
else
{
cout << "Reading MS2 not supported in this mode!" << endl;
exit(0);
/* if (! s.read_from_MS2_stream(config,stream) )
break;
s.init_spectrum();
ms2.peptide = s.get_peptide();
ms2.single_name = s.get_file_name();
ms2.org_pm_with_19 = s.get_org_pm_with_19();
ms2.charge = s.get_charge();
ms2.pm_with_19 = s.get_corrected_pm_with_19();
ms2.num_peaks = s.get_num_peaks();
if (ms2.org_pm_with_19 < min_spec_mass)
min_spec_mass = ms2.org_pm_with_19;
if (ms2.org_pm_with_19 > max_spec_mass)
max_spec_mass = ms2.org_pm_with_19;
if (ms2.charge <min_charge)
min_charge = ms2.charge;
if (ms2.charge >max_charge)
max_charge = ms2.charge;
num_spectra[ms2.charge]++;*/
}
single_spectra.push_back(ms2);
}
fclose(stream);
}
// reads the summary tsv file and stores stats
// extract all information from the tsv file
void PKL_dir::initial_read(Config *config, int dir_idx, const string& path,
const string& tsv_file , mass_t min_m_over_z, mass_t max_m_over_z)
{
this->dir_path = path;
this->tsv_path = tsv_file;
FILE *tsv_stream = fopen(tsv_file.c_str(),"r");
if (! tsv_stream)
{
cout << "Error: couldn't open tsv file for reading: " << tsv_file << endl;
exit(1);
}
char line_buff[2048];
fgets(line_buff,2048,tsv_stream); // first row is headers
while (fgets(line_buff,2048,tsv_stream))
{
int np=-1;
PKL_single pkl;
pkl.type = PKL;
pkl.file_idx = dir_idx;
pkl.m_over_z = -1;
pkl.single_name = "";
pkl.num_peaks=-1;
istringstream is(line_buff);
mass_t mz_sel,mz_cen,parent_MH;
int tmp_charge, ns;
float max_inten;
is >> pkl.single_name >> pkl.scan_number >> pkl.retention_time >> np >>
pkl.num_peaks >> mz_sel >> pkl.m_over_z >> mz_cen >> parent_MH >> tmp_charge >> ns >>
max_inten >> pkl.precursor_intensity;
if (pkl.num_peaks<=5)
continue;
if (pkl.m_over_z < min_m_over_z || pkl.m_over_z>max_m_over_z)
continue;
// set charge from file name
pkl.charge = 0;
if (pkl.single_name.length()>4)
{
char charge_sym = pkl.single_name[pkl.single_name.length()-5];
if (charge_sym<'0' || charge_sym>'9')
{
cout << "Error: couldn't extract charge from file name " << pkl.single_name << endl;
exit(1);
}
pkl.charge = int(charge_sym - '0');
}
single_spectra.push_back(pkl);
}
fclose(tsv_stream);
}
// reads a list with dirs and paths to tsv files
void FileManager::init_from_pkl_dir_list(Config *config, const char *list_file,
mass_t min_m_over_z, mass_t max_m_over_z)
{
FILE *list_stream=fopen(list_file,"r");
if (! list_stream)
{
cout << "Error: couldn't open pkl dir list for reading: " << list_file << endl;
exit(1);
}
char line_buff[1024];
int n=0;
while (fgets(line_buff,1024,list_stream))
n++;
fclose(list_stream);
pkl_dirs.resize(n);
list_stream=fopen(list_file,"r");
cout << "Read pkl files (idx path #spectra):" << endl;
int pkl_dir_idx=0;
while (fgets(line_buff,1024,list_stream))
{
istringstream is(line_buff);
string pkl_dir = "";
string tsv_file = "";
is >> pkl_dir >> tsv_file;
if (pkl_dir.length()>2 && tsv_file.length() >2)
{
pkl_dirs[pkl_dir_idx].initial_read(config,pkl_dir_idx,pkl_dir,tsv_file,
min_m_over_z, max_m_over_z);
if (pkl_dirs[pkl_dir_idx].single_spectra.size()>0)
{
cout << pkl_dir_idx << "\t" << pkl_dir << "\t" << pkl_dirs[pkl_dir_idx].single_spectra.size() << endl;
pkl_dir_idx++;
}
}
}
}
void FileManager::init_from_single_pkl_dir(Config *config, const string& pkl_dir_path,
const string& tsv_file, int pkl_dir_idx, mass_t min_m_over_z, mass_t max_m_over_z)
{
pkl_dirs.resize(1);
pkl_dirs[0].initial_read(config, pkl_dir_idx, pkl_dir_path, tsv_file,
min_m_over_z, max_m_over_z);
}
/*************************************************************************
Select all file pointers from the FileManager
**************************************************************************/
void FileSet::select_all_files(const FileManager& fm, bool remove_duplicates)
{
int i;
this->ssf_pointers.clear();
for (i=0; i<fm.dta_files.size(); i++)
{
ssf_pointers.push_back((struct DTA_file *)&fm.dta_files[i]);
// check that this doesn't correpond to the immediate previous spec
if (remove_duplicates && ssf_pointers.size()>1)
{
const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];
if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
{
ssf_pointers.pop_back();
}
}
}
for (i=0; i<fm.mgf_files.size(); i++)
{
int j;
for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
{
const SingleSpectrumFile *ssf = &fm.mgf_files[i].single_spectra[j];
if (ssf->num_peaks<5)
continue;
this->ssf_pointers.push_back((struct MGF_single *)ssf);
// check that this doesn't correpond to the immediate previous spec
if (remove_duplicates && ssf_pointers.size()>1)
{
const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];
if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
{
ssf_pointers.pop_back();
continue;
}
}
// cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
}
}
for (i=0; i<fm.mzxml_files.size(); i++)
{
int j;
for (j=0; j<fm.mzxml_files[i].single_spectra.size(); j++)
{
const SingleSpectrumFile *ssf = &fm.mzxml_files[i].single_spectra[j];
if (ssf->num_peaks<5)
continue;
this->ssf_pointers.push_back((struct MZXML_single *)ssf);
// check that this doesn't correpond to the immediate previous spec
if (remove_duplicates && ssf_pointers.size()>1)
{
const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];
if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
{
ssf_pointers.pop_back();
continue;
}
}
// cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
}
}
for (i=0; i<fm.dat_files.size(); i++)
{
int j;
for (j=0; j<fm.dat_files[i].single_spectra.size(); j++)
{
const SingleSpectrumFile *ssf = &fm.dat_files[i].single_spectra[j];
// if (ssf->num_peaks<5)
// continue;
this->ssf_pointers.push_back((struct DAT_single *)ssf);
// check that this doesn't correpond to the immediate previous spec
if (remove_duplicates && ssf_pointers.size()>1)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -