📄 filemanagement.cpp
字号:
#include "FileManagement.h"
#include "auxfun.h"
void read_paths_into_list(const char *list_file, vector<string>& list)
{
ifstream fs(list_file,ios::in);
if (! fs.good())
{
cout << "Error: couldn't open file for reading: " << list_file << endl;
exit(1);
}
char buff[512];
list.clear();
while ( fs.getline(buff,512) )
{
string path(buff);
if (path.length()>3)
{
list.push_back(path);
}
}
}
// parses the type from the file name, -1 if not recognized
int get_file_extension_type(const string& fname)
{
int length = fname.length();
int last_pos = length-1;
while (last_pos>0 && (
fname[last_pos] == '\n' || fname[last_pos] == '\r' ||
fname[last_pos] == '\t' || fname[last_pos] == '\f') )
last_pos--;
if (fname[last_pos-2]=='d' && fname[last_pos-1]=='t' &&
fname[last_pos ]=='a')
return DTA;
if (fname[last_pos-2]=='m' && fname[last_pos-1]=='g' &&
fname[last_pos ]=='f')
return MGF;
if (fname[last_pos-4] == 'm' &&
fname[last_pos-3] == 'z' &&
fname[last_pos-2]=='X' && fname[last_pos-1]=='M' &&
fname[last_pos ]=='L')
return MZXML;
if (fname[last_pos-2]=='d' && fname[last_pos-1]=='a' &&
fname[last_pos ]=='t')
return DAT;
if (fname[last_pos-2]=='m' && fname[last_pos-1]=='s' &&
fname[last_pos ]=='2')
return MS2;
if (fname[last_pos-2]=='t' && fname[last_pos-1]=='x' &&
fname[last_pos ]=='t')
return TXT;
return -1;
}
int SingleSpectrumFile::get_scan() const
{
if (type == MGF)
return ((MGF_single *)this)->idx_in_file;
if (type == MZXML)
return ((MZXML_single *)this)->scan_number;
if (type == DAT)
return ((DAT_single *)this)->scan_number;
return 0;
}
/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from dta
*****************************************************************/
bool DTA_file::scan_dta(const string& fname, const Config *config)
{
char buff[1024];
peptide.clear();
ifstream fs(fname.c_str(),ios::in);
if (! fs.good())
{
cout << "Error: couldn't open "<< fname << endl;
exit(1);
}
fs.getline(buff,1024);
while (buff[0] =='#')
{
if (! strncmp(buff,"#SEQ",4))
peptide.parse_from_string(config,buff+5);
if (! fs.getline(buff,1024))
{
cout << "Error scanning " << fname << endl;
exit(1);
}
continue;
}
istringstream is(buff);
org_pm_with_19 = -1;
charge = -1;
is >> org_pm_with_19 >> charge;
// charge = 0; // bad fix
if (org_pm_with_19<0 || charge <0)
{
cout << "Error: couldn't find pm and charge in DTA file, pm: "
<< org_pm_with_19 << " , charge: " << charge << endl;
exit(1);
}
if (charge>0)
{
m_over_z = (org_pm_with_19 + charge -1 ) / (mass_t)charge;
}
if (peptide.get_num_aas()>0 && charge>0)
{
mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
if (fabs(diff)>6.0)
{
// try and correct charge!
for (charge=1; charge<=3; charge++)
{
org_pm_with_19 = m_over_z * charge + MASS_PROTON * (1 - charge);
diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
if (diff<5)
break;
}
if (diff>7)
{
cout << "Error: sequence mass doesn't add up: " << fname << " (diff: "
<< diff << ")" << endl;
cout << "Pepitde: " << peptide.as_string(config) << endl;
cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
}
// exit(1);
}
}
num_peaks=0;
while (! fs.eof())
{
fs.getline(buff,64);
float mass,inten;
if (sscanf(buff,"%f %f",&mass,&inten)==2)
{
if (mass<0 || inten<=0)
continue;
num_peaks++;
}
}
fs.close();
return true;
}
/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from MGF location
*****************************************************************/
bool MGF_single::scan_mgf_single(FILE *stream, const Config *config)
{
char buff[1024];
mass_t exp_pm=-1;
peptide.clear();
while (fgets(buff, 256, stream))
{
file_pos = ftell(stream);
if (strncmp(buff,"BEGIN IONS",10) )
{
continue;
}
break;
}
charge=0;
m_over_z=-1;
// read header info and first peak
while (1)
{
if( ! fgets(buff, 1024, stream))
return false;
if (! strncmp(buff,"END IONS",7))
return false;
if (! strncmp(buff,"TITLE=",6) )
{
int len = strlen(buff)-1;
buff[len]='\0';
if (buff[len-1]=='\r' || buff[len-1]=='\n' )
buff[len-1]='\0';
single_name = buff + 6;
continue;
}
else
if (! strncmp(buff,"SEQ=",4) )
{
string seq_string(buff+4);
peptide.parse_from_string(config,seq_string);
peptide.calc_mass(config);
continue;
}
else
if (! strncmp(buff,"SCAN=",5) )
{
if (sscanf(buff+5,"%d",&scan_number) != 1)
{
cout << "Error: couldn't read scan number!" << endl;
exit(1);
}
continue;
}
else
if (! strncmp(buff,"RT=",3) )
{
if (sscanf(buff+3,"%f",&retention_time) != 1)
{
cout << "Error: couldn't read retention_time!" << endl;
exit(1);
}
continue;
}
else
if (! strncmp(buff,"CLUSTER_SIZE=",13) )
{
if (sscanf(buff+13,"%d",&cluster_size) != 1)
{
cout << "Error: couldn't read cluster size!" << endl;
exit(1);
}
continue;
}
else
if ( ! strncmp(buff,"CHARGE=",6))
{
if (sscanf(buff,"CHARGE=%d",&charge) != 1)
{
cout << "Error: couldn't read charge!" << endl;
return false;
}
}
else
if (! strncmp(buff,"PEPMASS=",8))
{
istringstream is(buff+8);
is >> m_over_z;
if (m_over_z < 0)
{
cout << "Error: reading pepmass:" << m_over_z << endl;
return false;
}
}
else
{
istringstream is(buff);
Peak p;
is >> p.mass >> p.intensity;
if (p.mass >0 && p.intensity>0)
{
first_peak_mass = p.mass;
break;
}
}
}
if (charge<0 || m_over_z<0)
return false;
org_pm_with_19 = m_over_z * charge + MASS_PROTON * (1 - charge);
if (org_pm_with_19<0)
{
org_pm_with_19=-1;
}
num_peaks=0;
while ( fgets(buff, 1024, stream) )
{
if (! strncmp(buff,"END IONS",8) )
break;
num_peaks++;
}
if (peptide.get_num_aas()>0 && charge>0)
{
mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
if (fabs(diff)>7.0)
{
// try and correct charge!
int c;
for (c=1; c<=4; c++)
{
mass_t new_pm_with_19 = m_over_z * c + MASS_PROTON * (1 - c);
diff = fabs(new_pm_with_19 - peptide.get_mass() - MASS_OHHH);
if (diff<10)
{
charge = c;
org_pm_with_19 = new_pm_with_19;
return true;
}
}
cout << "Error: MGF sequence mass doesn't add up: " << single_name << " (diff: "
<< diff << ")" << endl;
cout << "m/z: " << m_over_z << " org_pm_with_19: " << org_pm_with_19 << endl;
cout << "Pepitde: " << peptide.as_string(config) << " (" << peptide.get_mass() << ")" << endl;
cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
// org_pm_with_19 = -1;
return false;
}
}
return true;
}
/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from MS2 location
*****************************************************************/
bool MS2_single::scan_ms2_single(FILE *stream, const Config *config)
{
char buff[128];
mass_t exp_pm=-1;
peptide.clear();
while (fgets(buff, 256, stream))
{
if (! strncmp(buff,":",1) )
break;
}
if (feof(stream))
return true;
buff[strlen(buff)-1]='\0';
single_name = buff+1;
charge=-1;
org_pm_with_19=-1;
// read header info and first peak
if( ! fgets(buff, 1024, stream))
return false;
if (! strncmp(buff,"SEQ=",4) )
{
string seq_string(buff+4);
peptide.parse_from_string(config,seq_string);
if( ! fgets(buff, 1024, stream))
return false;
}
istringstream is(buff);
is >> org_pm_with_19 >> charge;
if (org_pm_with_19<0 || org_pm_with_19>10000)
{
cout << "Error: pm_with_19 in MS2 not in range: " << org_pm_with_19 << endl;
exit(1);
}
if (charge<=0 || charge>10)
{
cout << "Error: bad charge state in MS2: " << charge << endl;
exit(1);
}
m_over_z = (org_pm_with_19 -1 + charge) / charge;
num_peaks=0;
while ( fgets(buff, 128, stream) )
{
if (! strncmp(buff,":",1) || strlen(buff)< 3 )
break;
num_peaks++;
}
if (peptide.get_num_aas()>0 && charge>0)
{
mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
if (fabs(diff)>7.0)
{
cout << "Error: sequence mass doesn't add up: " << single_name << " (diff: "
<< diff << ")" << endl;
cout << "m/z: " << m_over_z << " org_pm_with_19: " << org_pm_with_19 << endl;
cout << "Pepitde: " << peptide.as_string(config) << " (" << peptide.get_mass() << ")" << endl;
cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
exit(1);
}
}
return true;
}
/*******************************************************************
Special function to overcome mzMXL parsing problems.
Reads the peak lists into a local buff of floats.
********************************************************************/
void FileManager::init_and_read_single_mzXML(
Config *config,
const char * file_name,
int file_idx,
mass_t min_m_over_z,
mass_t max_m_over_z)
{
min_charge =9999;
max_charge =0;
num_spectra.resize(100,0);
mzxml_files.resize(1);
mzxml_files[0].mzxml_name = file_name;
int mzxml_file_idx = (file_idx>=0 ? file_idx : 0);
mzxml_files[0].extract_peak_lists_from_mzXML(config,mzxml_files[0].mzxml_name,mzxml_file_idx,0,10000);
min_spec_mass = mzxml_files[0].min_spec_mass;
max_spec_mass = mzxml_files[0].max_spec_mass;
min_charge = mzxml_files[0].min_charge;
max_charge = mzxml_files[0].max_charge;
count_num_spectra();
// print_summary_stats();
}
void FileManager::init_from_list(Config *config, const vector<string>& list, bool quick_flag,
int file_idx)
{
int i,num_dta=0,num_mgf=0,num_mzxml=0,num_dat=0,num_ms2=0;
min_charge =9999;
max_charge =0;
// cout << "init from list: " << list.size() << endl;
num_spectra.resize(100,0);
for (i=0; i<list.size(); i++)
{
if (list[i][0] == '#')
continue;
int last_pos = list[i].length()-1;
if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' ||
list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
last_pos--;
if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' &&
list[i][last_pos ]=='a')
{
num_dta++;
}
else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' &&
list[i][last_pos ]=='f')
{
num_mgf++;
}
else if (list[i][last_pos-4] == 'm' &&
list[i][last_pos-3] == 'z' &&
list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' &&
list[i][last_pos ]=='L')
{
num_mzxml++;
}
else if (
list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' &&
list[i][last_pos ]=='t')
{
num_dat++;
}
else if (
list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' &&
list[i][last_pos ]=='2')
{
num_ms2++;
}
else
{
cout << "Error::: couldn't recognize file type for: " << list[i] << endl;
exit(1);
}
}
dta_files.resize(num_dta);
mgf_files.resize(num_mgf);
mzxml_files.resize(num_mzxml);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -