📄 filemanagement.cpp
字号:
{
const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];
if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
{
ssf_pointers.pop_back();
continue;
}
}
// cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
}
}
for (i=0; i<fm.pkl_dirs.size(); i++)
{
int j;
for (j=0; j<fm.pkl_dirs[i].single_spectra.size(); j++)
{
const SingleSpectrumFile *ssf = &fm.pkl_dirs[i].single_spectra[j];
if (ssf->num_peaks<5)
continue;
this->ssf_pointers.push_back((struct PKL_single *)ssf);
// check that this doesn't correpond to the immediate previous spec
if (remove_duplicates && ssf_pointers.size()>1)
{
const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];
if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.001) )
{
ssf_pointers.pop_back();
continue;
}
}
// cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
}
}
this->next_ssf_pointer=0;
}
/**********************************************************************
Selects dat ssfs that have an mzxml file idx of at most max_mzxml idx
***********************************************************************/
void FileSet::filter_dat_spectra_by_mzxml_idx(int max_mzxml_idx)
{
int i;
vector<SingleSpectrumFile *> new_ssfs;
for (i=0; i<ssf_pointers.size(); i++)
{
DAT_single *dat_ssf = (DAT_single *)ssf_pointers[i];
if (dat_ssf->mzxml_file_idx > max_mzxml_idx)
continue;
new_ssfs.push_back(ssf_pointers[i]);
}
ssf_pointers = new_ssfs;
}
/*************************************************************************
Select the file pointers from the FileManager
**************************************************************************/
void FileSet::select_files(const FileManager& fm,
mass_t min_pm_with_19,
mass_t max_pm_with_19,
score_t min_sqs,
score_t max_sqs,
int charge,
bool only_unassigned)
{
int i;
this->ssf_pointers.clear();
if (charge !=0)
{
min_charge = charge;
max_charge = charge;
}
for (i=0; i<fm.dta_files.size(); i++)
{
if (fm.dta_files[i].org_pm_with_19 < min_pm_with_19 ||
fm.dta_files[i].org_pm_with_19 > max_pm_with_19)
continue;
if (fm.dta_files[i].sqs>-1 &&
(fm.dta_files[i].sqs<min_sqs || fm.dta_files[i].sqs>max_sqs) )
continue;
if (charge>0 && fm.dta_files[i].charge != charge)
continue;
if (only_unassigned && fm.dta_files[i].assigned_cluster>=0)
continue;
if (fm.dta_files[i].charge<min_charge)
min_charge = fm.dta_files[i].charge;
if (fm.dta_files[i].charge>max_charge)
max_charge = fm.dta_files[i].charge;
ssf_pointers.push_back((struct DTA_file *)&fm.dta_files[i]);
}
for (i=0; i<fm.mgf_files.size(); i++)
{
int j;
for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
{
if (fm.mgf_files[i].single_spectra[j].org_pm_with_19 < min_pm_with_19 ||
fm.mgf_files[i].single_spectra[j].org_pm_with_19 > max_pm_with_19)
continue;
if ( fm.mgf_files[i].single_spectra[j].sqs > -1 &&
(fm.mgf_files[i].single_spectra[j].sqs<min_sqs || fm.mgf_files[i].single_spectra[j].sqs>max_sqs))
continue;
if (charge>0 && fm.mgf_files[i].single_spectra[j].charge != charge)
continue;
if (only_unassigned && fm.mgf_files[i].single_spectra[j].assigned_cluster>=0)
continue;
if (fm.mgf_files[i].single_spectra[j].charge>max_charge)
max_charge = fm.mgf_files[i].single_spectra[j].charge;
if (fm.mgf_files[i].single_spectra[j].charge<min_charge)
min_charge = fm.mgf_files[i].single_spectra[j].charge;
this->ssf_pointers.push_back((struct MGF_single *)&fm.mgf_files[i].single_spectra[j]);
}
}
this->next_ssf_pointer=0;
}
/*************************************************************************
Select the file pointers from the FileManager
**************************************************************************/
void FileSet::select_files_in_mz_range(
const FileManager& fm,
mass_t min_mz,
mass_t max_mz,
int charge)
{
int i;
this->ssf_pointers.clear();
if (charge !=0)
{
min_charge = charge;
max_charge = charge;
}
for (i=0; i<fm.mgf_files.size(); i++)
{
int j;
for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
{
if (fm.mgf_files[i].single_spectra[j].m_over_z < min_mz ||
fm.mgf_files[i].single_spectra[j].m_over_z > max_mz)
continue;
if (charge>0 && fm.mgf_files[i].single_spectra[j].charge != charge)
continue;
if (fm.mgf_files[i].single_spectra[j].charge>max_charge)
max_charge = fm.mgf_files[i].single_spectra[j].charge;
if (fm.mgf_files[i].single_spectra[j].charge<min_charge)
min_charge = fm.mgf_files[i].single_spectra[j].charge;
this->ssf_pointers.push_back((struct MGF_single *)&fm.mgf_files[i].single_spectra[j]);
}
}
this->next_ssf_pointer=0;
}
// Randomly select n ssf pointers
void FileSet::randomly_reduce_ssfs(int n)
{
if (n>= ssf_pointers.size())
return;
vector<int> idxs;
choose_k_from_n(n,ssf_pointers.size(),idxs);
vector<SingleSpectrumFile *> red_ssfs;
red_ssfs.resize(n,NULL);
int i;
for (i=0; i<n; i++)
red_ssfs[i] = ssf_pointers[idxs[i]];
ssf_pointers = red_ssfs;
}
struct mz_pair {
bool operator< (const mz_pair& other) const
{
return (m_over_z < other.m_over_z);
}
mass_t m_over_z;
SingleSpectrumFile *ssf;
};
void FileSet::sort_according_to_m_over_z()
{
vector<mz_pair> pairs;
pairs.resize(ssf_pointers.size());
int i;
for (i=0; i<ssf_pointers.size(); i++)
{
pairs[i].m_over_z=ssf_pointers[i]->m_over_z;
pairs[i].ssf = ssf_pointers[i];
}
sort(pairs.begin(),pairs.end());
for (i=0; i<pairs.size(); i++)
ssf_pointers[i]=pairs[i].ssf;
}
// removes all ssf without a peptides
void FileSet::keep_only_spectra_with_peptides()
{
int i;
vector<SingleSpectrumFile *> keep_ssfs;
keep_ssfs.clear();
for (i=0; i<ssf_pointers.size(); i++)
if (ssf_pointers[i]->peptide.get_num_aas()>3)
keep_ssfs.push_back(ssf_pointers[i]);
ssf_pointers = keep_ssfs;
}
bool comp_ssf_pointers(const SingleSpectrumFile *ss1,
const SingleSpectrumFile *ss2)
{
return ( (ss1->file_idx < ss2->file_idx) ||
(ss1->file_idx == ss2->file_idx && ss1->file_pos < ss2->file_pos));
}
// copies the ssf pointers from another FileSet
// sorts them according to the file order (not m_over_z);
void FileSet::init_from_another_fs(const FileSet& other_fs,
int start_ssf_idx, int end_ssf_idx)
{
int i;
reset_pointers();
ssf_pointers.clear();
ssf_pointers.reserve(end_ssf_idx-start_ssf_idx+1);
int not_added=0;
for (i=start_ssf_idx; i<=end_ssf_idx; i++)
if (other_fs.ssf_pointers[i]->assigned_cluster<0)
{
ssf_pointers.push_back(other_fs.ssf_pointers[i]);
}
// else
// not_added++;
// if (not_added>0)
// cout << "Not added: " << not_added << endl;
// the comp_ssf_pointers is used so the ssfs are addressed according to
// the order in which they appear in the list file
// if this causes problems can use regular sort according to pointer value
// this will cause the files to be accessed in an unpredictable order (however
// the spectra in the files will be accessed in the correct order)
sort(ssf_pointers.begin(),ssf_pointers.end(),comp_ssf_pointers);
}
/************************************************************
// reads the next spectrum into spec
// returns false if no more spectra are available
// returns the mgf_pointer through mp, if file = -1 this means
// this was a dta
*************************************************************/
bool FileSet::get_next_spectrum(const FileManager& fm,
Config *config,
Spectrum *spec,
SingleSpectrumFile **ssfp,
bool perform_init_spectrum,
bool set_charge_to_zero)
{
SingleSpectrumFile *ssf;
if (next_ssf_pointer < ssf_pointers.size())
{
ssf = ssf_pointers[next_ssf_pointer++];
if (ssfp)
*ssfp=ssf;
if (ssf->type == DTA)
{
if (set_charge_to_zero)
{
spec->read_from_dta(config,ssf->single_name.c_str(),0);
}
else
spec->read_from_dta(config,ssf->single_name.c_str());
if (perform_init_spectrum)
spec->init_spectrum();
return true;
}
else if (ssf->type == MGF)
{
if (this->current_mgf_file_idx != ssf->file_idx)
{
if (mgf_stream)
fclose(mgf_stream);
mgf_stream=fopen(fm.mgf_files[ssf->file_idx].mgf_name.c_str(),"r");
if (! mgf_stream)
{
cout << "Error: Couldn't open file for reading: " <<
fm.mgf_files[ssf->file_idx].mgf_name << endl;
exit(1);
}
this->current_mgf_file_idx = ssf->file_idx;
}
if ( fseek(mgf_stream,ssf->file_pos,0) )
{
cout << "Error: could not skip in file!" << endl;
exit(1);
}
spec->read_from_MGF_stream(config,mgf_stream);
if (perform_init_spectrum)
spec->init_spectrum();
return true;
}
else
{
cout << "Error: invalid file type:" << ssf->type << endl;
exit(1);
}
}
else
return false;
}
inline bool comp_ssf(const SingleSpectrumFile* ssf1, const SingleSpectrumFile* ssf2)
{
return ( ssf1->file_idx < ssf2->file_idx ||
( (ssf1->file_idx == ssf2->file_idx) && (ssf1->file_pos<ssf2->file_pos) ) );
}
// creates an mgf file with the desred number of spectra per charge as designated
// in the spectra_per_charges vector(strating from charge 0). so the maximum
// number of from each charge in the outputted in the mgf file will be at most
// the numbers desginated in the vector
void FileSet::create_mgf_file(const FileManager& fm, Config *config, const char *file_name,
vector<int> spectra_per_charges)
{
int max_charge = spectra_per_charges.size()-1;
vector<int> spectra_written;
spectra_written.resize(max_charge+1,0);
ofstream ofs(file_name,ios::out);
if (! ofs.good())
{
cout << "Error: couldn't open file for writing: " << file_name << endl;
exit(1);
}
this->reset_pointers();
while (1 )
{
Spectrum s;
if (! this->get_next_spectrum(fm,config,&s))
break;
// s.set_charge(0);
int spec_charge = s.get_charge();
if (spec_charge>max_charge)
continue;
if (spectra_written[spec_charge]>=spectra_per_charges[spec_charge])
continue;
spectra_written[spec_charge]++;
s.output_as_MGF(ofs);
}
ofs.close();
}
/*************************************************************
// iterates over the files and makes a fasta out of the seq
// puts 10 in a row, calls it TRUE_X
uses the org_aa
**************************************************************/
void FileSet::make_fasta_from_file_seqs(const FileManager& fm, Config *config,
int inc, ostream& os)
{
int seq_counter=0;
int line_counter=0;
Spectrum s;
while ( 1 )
{
if ( ! this->get_next_spectrum(fm,config,&s))
break;
if (s.get_peptide().get_num_aas()>0)
{
if (seq_counter == 0)
os << ">TRUE_" << line_counter*inc << endl;
Peptide p = s.get_peptide();
p.convert_to_org(config);
os << p.as_string(config);
seq_counter++;
if (seq_counter == inc)
{
os << endl;
seq_counter=0;
line_counter++;
}
}
}
os << endl;
}
// concatonates several mgf files into one large file
// slow and stupid...
void concat_mgf_files(Config *config, const char *mgf_file_list,
const char *big_mgf_file)
{
int i;
ofstream ofs(big_mgf_file,ios::out);
if (! ofs.good() )
{
cout << "Error: couldn't open file for writing: " << big_mgf_file << endl;
exit(1);
}
vector<string> list;
read_paths_into_list(mgf_file_list,list);
for (i=0; i<list.size(); i++)
{
Spectrum s;
FILE *mgf_stream = fopen(list[i].c_str(),"r");
if (mgf_stream)
{
cout << "Error: couldn't open file for reading: " << list[i] << endl;
exit(1);
}
while ( s.read_from_MGF_stream(config,mgf_stream) )
{
// s.init_spectrum(); // so we filter peaks
s.output_as_MGF(ofs);
}
fclose(mgf_stream);
}
ofs.close();
}
/****************************
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -