📄 denovosolutions.cpp
字号:
{
vector<SeqPath> alt_solutions;
generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
int j;
for (j=0; j<alt_solutions.size(); j++)
solutions.push_back(alt_solutions[j]);
}
if (solutions.size() == 0)
{
ssf->print_ssf_stats(config,out_stream);
out_stream << " - no tags found" << endl;
no_tags++;
continue;
}
output_tag_solutions(ssf,config,out_stream,solutions);
if (ssf->peptide.get_num_aas()>2)
{
num_benchmark++;
int idx;
for (idx=0; idx<solutions.size() && num_solutions; idx++)
if (solutions[idx].check_if_correct(ssf->peptide.as_string(config),config))
{
correct_benchmark++;
break;
}
}
scan_count++;
}
}
if (report_progress)
{
time_t curr_time = time(NULL);
double elapsed_time = (curr_time - last_time);
cout << "Total running time: " << elapsed_time << endl;
}
if (num_benchmark>0)
{
cout << endl << "Correct " << correct_benchmark << "/" << num_benchmark << " (" <<
setprecision(3) << fixed << (float)correct_benchmark/num_benchmark << ")" << endl;
}
}
// makes a FASTA file with the sequences of full denovo sequences (completed
// from the SEQ in the annotated mgf file)
void make_denovo_training_fa(AdvancedScoreModel& model,
char *spectrum_file)
{
Config *config = model.get_config();
FileManager fm;
FileSet fs;
// Quick read, get all pointers to begining of spectra
if (get_file_extension_type(spectrum_file) != MZXML)
{
fm.init_from_file(config,spectrum_file);
}
else // reads peaks
fm.init_and_read_single_mzXML(config,spectrum_file,0);
fs.select_all_files(fm);
cout << "Generating fa for: " << spectrum_file << endl;
cout << "Scans: " << fs.get_total_spectra() << endl;
string file_name,fa_file;
get_file_name_without_extension(spectrum_file,file_name);
fa_file = file_name + ".dnv.fa";
ofstream fa_stream(fa_file.c_str());
time_t start_time;
start_time = time(NULL);
vector<PrmGraph *> prm_ptrs;
BasicSpecReader bsr;
int too_few_peaks=0;
int bad_charge=0;
int no_sols=0;
int scan_count=0;
prm_ptrs.resize(50,NULL);
///////////////////////////////////////////////
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
// if (ssf->get_scan()<11254)
// continue;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = 0;
if (num_peaks<10)
{
too_few_peaks++;
continue;
}
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
if ( ssf->charge > model.get_max_score_model_charge())
{
bad_charge++;
continue;
}
vector<SeqPath> seqpath_solutions;
vector<int> charges;
vector<mass_t> pms_with_19;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
Peptide correct_peptide = ssf->peptide;
// output m/z and prob values for the different charge states
vector<PmcSqsChargeRes> pmcsqs_res;
model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);
cout << "Scan " << bs.ssf->get_scan() << ",\tch: " << charges[0] << setprecision(1) << "\tpm19: " <<
pms_with_19[0] << setprecision(3) << " \tSQS: " << pmcsqs_res[charges[0]].sqs_prob;
if (pmcsqs_res[charges[0]].sqs_prob<0.01 && ! config->get_use_spectrum_charge())
{
// cout << " - skipping..." << endl;
// continue;
}
pms_with_19.clear();
charges.clear();
pms_with_19.push_back(s.get_true_mass_with_19());
charges.push_back(s.get_charge());
generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(prm_ptrs,
&model,&s,true,150,6,14,pms_with_19,charges,seqpath_solutions);
int j;
for (j=0; j<seqpath_solutions.size(); j++)
{
const int num_correct_aas = seqpath_solutions[j].get_num_correct_aas(correct_peptide,config);
if (num_correct_aas==seqpath_solutions[j].get_num_aa())
{
seqpath_solutions[j]=seqpath_solutions[seqpath_solutions.size()-1];
seqpath_solutions.pop_back();
}
}
vector<PeptideSolution> pep_solutions;
if (seqpath_solutions.size()>0)
{
// create peptide solutions
pep_solutions.resize(seqpath_solutions.size());
int s_idx;
for (s_idx=0; s_idx<seqpath_solutions.size(); s_idx++)
{
convert_seq_path_to_peptide_soluition_and_fill_in_aas(config,
correct_peptide, seqpath_solutions[s_idx],pep_solutions[s_idx]);
}
}
else
continue;
if (seqpath_solutions.size() == 0)
{
cout << " - no sols" << endl;
no_sols++;
continue;
}
fa_stream << ">" << file_name << ":" << ssf->get_scan() << endl;
// sample de novo sequences
int i;
for (i=9; i<pep_solutions.size(); i+=10)
{
int idx = int(my_random()*i);
pep_solutions[idx].pep.convert_to_org(config);
fa_stream << pep_solutions[idx].pep.as_string(config);
pep_solutions[idx].num_correct_aas=pep_solutions[idx].pep.calc_number_of_correct_aas(config,correct_peptide);
cout << " " << idx << "(" << pep_solutions[idx].num_correct_aas << ")";
}
cout << endl;
fa_stream << endl;
scan_count++;
// cout << " " << seqpath_solutions.size() << " sols." << endl;
if (sc % 200 == 0)
{
time_t curr_time = time(NULL);
cout << setprecision(1) << fixed;
cout << "done " << sc << "/" << all_ssf.size() << " " << curr_time - start_time << " secs., "
<< scan_count << " scans completed." << endl;
}
}
fa_stream.close();
cout << "Done..." << endl;
cout << "fa file: " << fa_file << endl;
cout << "Wrote fa for " << scan_count << " spectra." << endl;
if (bad_charge>0)
cout << "Scans with bad charges : " << bad_charge << endl;
if (too_few_peaks>0)
cout << "Scans with too few peaks: " << too_few_peaks << endl;
if (no_sols>0)
cout << "Scans for which no tags were found: " << no_sols << endl;
}
void benchmark_tags(AdvancedScoreModel& model,
char *list,
char *tag_string,
int num_test_cases)
{
Config *config = model.get_config();
FileManager fm;
FileSet fs;
vector<int> num_tags; // how many tags from each length
int main_length;
parse_tag_string(tag_string,main_length,num_tags);
fm.init_from_list_file(config,list);
fs.select_all_files(fm);
cout << "benchmarking tags for : " << list << endl;
cout << "Scans : " << fs.get_total_spectra() << endl;
cout << "Tag types : " << tag_string << endl;
if (num_test_cases<0)
{
num_test_cases=fs.get_total_spectra();
}
else
{
fs.randomly_reduce_ssfs(num_test_cases);
}
cout << "Will try to get " << num_test_cases << " test cases" << endl;
time_t start_time;
start_time = time(NULL);
vector<PrmGraph *> prm_ptrs;
BasicSpecReader bsr;
int too_few_peaks=0;
int bad_charge=0;
int no_tags=0;
int scan_count=0;
vector<int> first_correct_len;
vector<int> correct_ranks;
int num_spectra_tested=0;
int had_correct =0;
first_correct_len.resize(10,0);
prm_ptrs.resize(50,NULL);
///////////////////////////////////////////////
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
string peptide_str = ssf->peptide.as_string(config);
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
vector<int> tweak_charges;
vector<mass_t> tweak_pms_with_19;
tweak_charges.resize(6,0);
tweak_pms_with_19.resize(6,0);
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = 0;
if (num_peaks<10)
{
too_few_peaks++;
continue;
}
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
// s.print_spectrum();
if ( ssf->charge > model.get_max_score_model_charge())
{
bad_charge++;
continue;
}
vector<SeqPath> solutions;
vector<mass_t> pms_with_19, alt_pms_with_19;
vector<int> charges, alt_charges;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
// output m/z and prob values for the different charge states
vector<PmcSqsChargeRes> pmcsqs_res;
model.select_pms_and_charges(config,bs,pms_with_19,charges,true,&pmcsqs_res);
if (pmcsqs_res[charges[0]].sqs_prob<0.01)
{
// continue;
}
cout << sc << "\t" << num_peaks << "\t" << setprecision(2) << pmcsqs_res[charges[0]].sqs_prob << "\t" <<
ssf->peptide.as_string(config) << " (" << ssf->peptide.get_mass_with_19() - s.get_org_pm_with_19() << ")" << endl;
alt_charges.clear();
alt_pms_with_19.clear();
while (charges.size()>0 && charges[0] != charges[charges.size()-1])
{
alt_charges.push_back(charges[charges.size()-1]);
charges.pop_back();
alt_pms_with_19.push_back(pms_with_19[pms_with_19.size()-1]);
pms_with_19.pop_back();
}
if (pms_with_19.size()>0)
{
generate_tags(prm_ptrs,&model,bs,&s,num_tags, main_length,
pms_with_19,charges,solutions);
}
if (alt_pms_with_19.size()>0)
{
vector<SeqPath> alt_solutions;
generate_tags(prm_ptrs,&model,bs,&s, num_tags, main_length,
alt_pms_with_19, alt_charges, alt_solutions, false, alt_solutions.size());
int j;
for (j=0; j<alt_solutions.size(); j++)
solutions.push_back(alt_solutions[j]);
}
num_spectra_tested++;
if (num_test_cases==num_spectra_tested)
break;
if (solutions.size() == 0)
{
no_tags++;
continue;
}
int j;
for (j=0; j<solutions.size(); j++)
if (solutions[j].check_if_correct(peptide_str,config))
{
had_correct++;
first_correct_len[solutions[j].get_num_aa()]++;
correct_ranks.push_back(j);
break;
}
if (sc % 200 == 0)
{
time_t curr_time = time(NULL);
cout << setprecision(1) << fixed;
cout << "done " << sc << "/" << all_ssf.size() << " " << curr_time - start_time << " secs., "
<< scan_count << " scans completed." << endl;
}
}
cout << "Tested " << num_spectra_tested << " spectra" << endl;
if (bad_charge>0)
cout << "Scans with bad charges : " << bad_charge << endl;
if (too_few_peaks>0)
cout << "Scans with too few peaks: " << too_few_peaks << endl;
if (no_tags>0)
cout << "Scans for which no tags were found: " << no_tags << endl;
cout << "% with correct " << setprecision(3) << (float)had_correct/(float)num_spectra_tested << endl;
cout << "First length breakdown: " << endl;
int j;
for (j=1; j<first_correct_len.size(); j++)
if (first_correct_len[j]>0)
cout << j << "\t" << (float)first_correct_len[j]/(float)num_spectra_tested << endl;
cout << endl;
// 1 3 5 10 25 50 100
vector<int> counts,seps;
seps.push_back(1);
seps.push_back(3);
seps.push_back(5);
seps.push_back(10);
seps.push_back(25);
seps.push_back(50);
seps.push_back(100);
seps.push_back(250);
seps.push_back(500);
counts.resize(seps.size(),0);
for (j=0; j<correct_ranks.size(); j++)
{
int k;
for (k=0; k<seps.size(); k++)
if (correct_ranks[j]<seps[k])
counts[k]++;
}
for (j=0; j<seps.size(); j++)
{
cout << seps[j] << "\t" << counts[j]/(float)all_ssf.size() << endl;
}
cout << endl;
}
void perform_denovo_on_list_of_files(AdvancedScoreModel& model,
const vector<string>& list_vector,
int file_start_idx,
int num_solutions,
int min_length,
int max_length,
bool report_progress,
ostream& out_stream)
{
int num_rerank_sols = 200;
Config *config = model.get_config();
int correct_benchmark=0;
int total_benchmark=0;
int spec_counter=0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -