📄 denovosolutions.cpp
字号:
if (solutions.size() == 0)
{
out_stream << "No solutions found." << endl;
}
else
{
out_stream << "#Index\tRnkScr\tPnvScr\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
int i;
for (i=0; i<solutions.size(); i++)
{
mass_t c_gap=solutions[i].pm_with_19 - solutions[i].c_term_mass;
if (c_gap<24.0)
c_gap = 0;
out_stream << setprecision(3) << fixed << i << "\t";
out_stream << solutions[i].rerank_score << "\t";
out_stream << solutions[i].path_score << "\t";
out_stream << solutions[i].n_term_mass << "\t";
out_stream << c_gap << "\t";
out_stream << solutions[i].pm_with_19 << "\t";
out_stream << solutions[i].charge << "\t";
out_stream << solutions[i].seq_str;
out_stream << endl;
}
}
out_stream << endl;
}
void parse_tag_string(char *tag_string, int& main_length, vector<int>& num_tags)
{
num_tags.clear();
num_tags.resize(10,0);
string ts(tag_string);
int i;
for (i=0; i<ts.length(); i++)
if (ts[i]==':')
ts[i]=' ';
istringstream iss(ts);
int len=0;
while (iss>>len)
{
if (len<2 || len>10)
{
cout << "Error: bad string for tag lengths, should be something like \"4:20:5:30\""<< endl;
exit(1);
}
int n=0;
iss >> n;
if (n<0 || n>1000)
{
cout << "Error: number of tags for length " << len << " should be 1-1000" << endl;
exit(1);
}
num_tags[len]=n;
}
main_length=0;
for (i=0; i<10; i++)
if (num_tags[i]>=5)
{
main_length=i;
break;
}
if (main_length<=0)
{
for (i=0; i<10; i++)
if (num_tags[i]>0)
{
main_length=i;
break;
}
}
if (main_length<=0)
{
cout << "Error parsing tag_string: " << tag_string << endl;
exit(1);
}
}
/****************************************************************
The input uses a tag string which has the format
4:20:5:30 - meaning 20 tags of length 4 and 30 tags of length 5
Generate a text file containing tags for a whole spectrum.
The format is:
<scan number> <number tags>
tweak 0 (each tweak is charge pm_with_19)
..
tweak 5
tag 0 (tweak_idx score num_aas n_term mass AASeq)
..
tag n-1
*****************************************************************/
void create_tag_file_for_inspect(AdvancedScoreModel& model,
char *spectrum_file,
char *tag_string,
char *tag_suffix)
{
const int max_tweak_charge =3;
Config *config = model.get_config();
FileManager fm;
FileSet fs;
vector<int> num_tags; // how many tags from each length
int main_length;
parse_tag_string(tag_string,main_length,num_tags);
// Quick read, get all pointers to begining of spectra
if (get_file_extension_type(spectrum_file) != MZXML)
{
fm.init_from_file(config,spectrum_file);
}
else // reads peaks
fm.init_and_read_single_mzXML(config,spectrum_file,0);
fs.select_all_files(fm);
cout << "Generating tags for: " << spectrum_file << endl;
cout << "Scans: " << fs.get_total_spectra() << endl;
string file_name,tag_file;
get_file_name_without_extension(spectrum_file,file_name);
tag_file = file_name + "_" + string(tag_suffix) + ".txt";
ofstream tag_stream(tag_file.c_str());
time_t start_time;
start_time = time(NULL);
vector<PrmGraph *> prm_ptrs;
BasicSpecReader bsr;
int too_few_peaks=0;
int bad_charge=0;
int no_tags=0;
int scan_count=0;
prm_ptrs.resize(50,NULL);
///////////////////////////////////////////////
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
// if (ssf->get_scan()<11254)
// continue;
vector<int> tweak_charges;
vector<mass_t> tweak_pms_with_19;
tweak_charges.resize(6,0);
tweak_pms_with_19.resize(6,0);
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = 0;
if (num_peaks<10)
{
too_few_peaks++;
continue;
}
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
if ( ssf->charge > model.get_max_score_model_charge())
{
bad_charge++;
continue;
}
vector<SeqPath> solutions;
vector<mass_t> pms_with_19, alt_pms_with_19;
vector<int> charges, alt_charges;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
// output m/z and prob values for the different charge states
vector<PmcSqsChargeRes> pmcsqs_res;
model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);
cout << "Scan " << bs.ssf->get_scan() << ",\tch: " << charges[0] << setprecision(1) << "\tpm19: " <<
pms_with_19[0] << setprecision(3) << " \tSQS: " << pmcsqs_res[charges[0]].sqs_prob;
if (config->get_filter_flag() &&
! config->get_use_spectrum_charge() &&
pmcsqs_res[charges[0]].sqs_prob<0.01)
{
cout << " - skipping..." << endl;
continue;
}
alt_charges.clear();
alt_pms_with_19.clear();
while (charges.size()>0 && charges[0] != charges[charges.size()-1])
{
alt_charges.push_back(charges[charges.size()-1]);
charges.pop_back();
alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
pms_with_19.pop_back();
}
if (pms_with_19.size()>0)
{
generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
pms_with_19,charges,solutions);
// set tweaks
int charge = charges[0];
if (charge>max_tweak_charge)
{
cout << "Error: max allowed tweak charge is " << max_tweak_charge << ", selected charge " << charge << endl;
exit(1);
}
tweak_charges[2*(charge-1)]=charge;
tweak_pms_with_19[2*(charge-1)]=pms_with_19[0];
if (pms_with_19.size()>1)
{
tweak_charges[2*(charge-1)+1]=charge;
tweak_pms_with_19[2*(charge-1)+1]=pms_with_19[1];
}
}
if (alt_pms_with_19.size()>0)
{
vector<SeqPath> alt_solutions;
generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
int j;
for (j=0; j<alt_solutions.size(); j++)
solutions.push_back(alt_solutions[j]);
// set tweaks
int charge = alt_charges[0];
if (charge>max_tweak_charge)
{
cout << "Error: max allowed tweak charge is " << max_tweak_charge << ", selected charge " << charge << endl;
exit(1);
}
tweak_charges[2*(charge-1)]=charge;
tweak_pms_with_19[2*(charge-1)]=alt_pms_with_19[0];
if (alt_pms_with_19.size()>1)
{
tweak_charges[2*(charge-1)+1]=charge;
tweak_pms_with_19[2*(charge-1)+1]=alt_pms_with_19[1];
}
}
if (solutions.size() == 0)
{
cout << " - no tags" << endl;
no_tags++;
continue;
}
tag_stream << fixed << setprecision(3);
tag_stream << ssf->get_scan() << "\t" << solutions.size() << endl;
int i;
for (i=0; i<tweak_charges.size(); i++)
tag_stream << tweak_charges[i] << "\t" << tweak_pms_with_19[i] << endl;
for (i=0; i<solutions.size(); i++)
{
const float score = ( solutions[i].rerank_score>-200 ? solutions[i].rerank_score : solutions[i].path_score);
const int charge = solutions[i].charge;
int best_tweak_idx=2*(charge-1);
if (tweak_charges[2*(charge-1)+1]>0 &&
fabs(tweak_pms_with_19[2*(charge-1)+1]-solutions[i].pm_with_19)<
fabs(tweak_pms_with_19[2*(charge-1)]-solutions[i].pm_with_19))
best_tweak_idx = 2*(charge-1)+1;
tag_stream << best_tweak_idx << "\t" << setprecision(2) << score << "\t" << setprecision(3) << solutions[i].n_term_mass << "\t" << solutions[i].seq_str << endl;
}
scan_count++;
cout << " " << solutions.size() << " tags." << endl;
if (sc % 200 == 0)
{
time_t curr_time = time(NULL);
cout << setprecision(1) << fixed;
cout << "done " << sc << "/" << all_ssf.size() << " " << curr_time - start_time << " secs., "
<< scan_count << " scans completed." << endl;
}
}
tag_stream.close();
cout << "Done..." << endl;
cout << "Tag file: " << tag_file << endl;
cout << "Wrote tags for " << scan_count << " spectra." << endl;
if (bad_charge>0)
cout << "Scans with bad charges : " << bad_charge << endl;
if (too_few_peaks>0)
cout << "Scans with too few peaks: " << too_few_peaks << endl;
if (no_tags>0)
cout << "Scans for which no tags were found: " << no_tags << endl;
}
void perform_tags_on_list_of_files(AdvancedScoreModel& model,
const vector<string>& list_vector,
int file_start_idx,
int num_solutions,
int tag_length,
bool report_progress,
ostream& out_stream)
{
Config *config = model.get_config();
vector<int> num_tags; // how many tags from each length
int main_length;
num_tags.resize(10,0);
num_tags[tag_length]=num_solutions;
main_length = tag_length;
// Quick read, get all pointers to begining of spectra
time_t start_time, last_time;
start_time = time(NULL);
last_time = start_time;
vector<PrmGraph *> prm_ptrs;
BasicSpecReader bsr;
int too_few_peaks=0;
int bad_charge=0;
int no_tags=0;
int scan_count=0;
int correct_benchmark=0;
int num_benchmark=0;
prm_ptrs.resize(50,NULL);
int f_idx;
for (f_idx=0; f_idx<list_vector.size(); f_idx++)
{
FileManager fm;
FileSet fs;
const string spectrum_file = list_vector[f_idx];
if (get_file_extension_type(spectrum_file) != MZXML)
{
fm.init_from_file(config,spectrum_file.c_str());
}
else // reads peaks
fm.init_and_read_single_mzXML(config,spectrum_file.c_str(),0);
fs.select_all_files(fm);
///////////////////////////////////////////////
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
time_t curr_time = time(NULL);
double elapsed_time = (curr_time - last_time);
if (report_progress && elapsed_time>5)
{
last_time = curr_time;
cout << "Processing file " << f_idx+1 << "/" << list_vector.size();
if (all_ssf.size() == 1)
{
cout << " spectrum 1/1 of current file." << endl;
}
else
cout << " spectrum " << sc+1 << "/" << all_ssf.size() <<
" of current file." << endl;
last_time=curr_time;
}
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = f_idx+file_start_idx;
// convert peak list ot a spectrum with charge (if original charge ==0)
// the spectrum gets charge 2, but the true charge is computed from the data
if (num_peaks<5)
{
ssf->print_ssf_stats(config,out_stream);
out_stream << "# too few peaks..." << endl;
continue;
}
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
if ( ssf->charge > model.get_max_score_model_charge())
{
bad_charge++;
continue;
}
vector<SeqPath> solutions;
vector<mass_t> pms_with_19, alt_pms_with_19;
vector<int> charges, alt_charges;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
// output m/z and prob values for the different charge states
vector<PmcSqsChargeRes> pmcsqs_res;
model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);
if (! config->get_use_spectrum_charge() &&
config->get_filter_flag() &&
pmcsqs_res[charges[0]].sqs_prob<0.01)
{
ssf->print_ssf_stats(config,out_stream);
out_stream << " - low quality, skipping..." << endl;
continue;
}
alt_charges.clear();
alt_pms_with_19.clear();
while (charges.size()>0 && charges[0] != charges[charges.size()-1])
{
alt_charges.push_back(charges[charges.size()-1]);
charges.pop_back();
alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
pms_with_19.pop_back();
}
if (pms_with_19.size()>0)
{
generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
pms_with_19,charges,solutions);
}
if (alt_pms_with_19.size()>0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -