📄 denovorankscore.cpp
字号:
{
if (min_denovo_correct_rank<0)
min_denovo_correct_rank=s_idx;
solutions[s_idx].is_correct = true;
}
else
solutions[s_idx].is_correct = false;
if (solutions[s_idx].check_if_cut_correct(exp_cut_masses, tolerance))
{
if (min_denovo_cut_correct_rank == POS_INF)
min_denovo_cut_correct_rank=s_idx;
solutions[s_idx].is_cut_correct = true;
}
else
solutions[s_idx].is_correct = false;
solutions[s_idx].org_rank = s_idx;
}
// only include spectra that have a good path in the top xxx
// if (min_denovo_correct_rank==NEG_INF)
// continue;
// if the correct path is not in the solutions, look for one in the prm graphs and add it
if (min_denovo_correct_rank==NEG_INF)
{
cout << endl << spec_idx << " skipped..." << endl;
if (solutions.size()>0)
{
total_pred_rerank[0]+=solutions[0].get_num_aa();
total_pred_denovo[0]+=solutions[0].get_num_aa();
int corr=solutions[0].get_num_correct_aas(ssf->peptide,config);
total_corr_rerank[0]+=corr;
total_corr_denovo[0]+=corr;
}
continue;
}
cout << endl << spec_idx << "\tDNV rank: " << min_denovo_correct_rank << endl;
DeNovoRankScorer *drs = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
vector<score_pair> scores;
drs->score_denovo_sequences(solutions,ssf,peaks,bs.num_peaks,scores,-1);
sort(scores.begin(),scores.end());
int min_rerank_correct=NEG_INF;
int min_rerank_cut_correct=POS_INF;
int j;
for (j=0; j<scores.size(); j++)
{
int org_rank = scores[j].idx;
if (min_rerank_correct<0 && solutions[org_rank].is_correct)
{
min_rerank_correct = j;
}
if (min_rerank_cut_correct==POS_INF && solutions[org_rank].is_cut_correct)
{
min_rerank_cut_correct = j;
}
if (j<10 || min_rerank_correct == j)
{
cout << j << "\t" << org_rank << "\t" << scores[j].score << endl;
}
}
cout << "Best ranks: Dnv " << min_denovo_correct_rank << " Rnk " << min_rerank_correct << endl;
cout << endl;
if (min_rerank_correct == NEG_INF)
min_rerank_correct = POS_INF;
if (min_rerank_cut_correct > min_rerank_correct)
min_rerank_cut_correct = min_rerank_correct;
if (min_denovo_correct_rank == NEG_INF)
min_denovo_correct_rank = POS_INF;
if (min_denovo_cut_correct_rank > min_denovo_correct_rank)
min_denovo_cut_correct_rank = min_denovo_correct_rank;
top_denovo[0].push_back(min_denovo_correct_rank);
top_rerank[0].push_back(min_rerank_correct);
top_denovo[1].push_back(min_denovo_cut_correct_rank);
top_rerank[1].push_back(min_rerank_cut_correct);
if (min_denovo_correct_rank==0)
num_top_denovo_correct++;
if (min_rerank_correct==0)
num_top_rerank_correct++;
int num_corr_denovo=solutions[0].get_num_correct_aas(ssf->peptide,config);
int num_pred_denovo=solutions[0].get_num_aa();
int num_corr_rerank = solutions[scores[0].idx].get_num_correct_aas(ssf->peptide,config);
int num_pred_rerank = solutions[scores[0].idx].get_num_aa();
total_corr_denovo[0] += num_corr_denovo;
total_pred_denovo[0] += num_pred_denovo;
total_corr_rerank[0] += num_corr_rerank;
total_pred_rerank[0] += num_pred_rerank;
num_spec_tested++;
if (num_spec_tested % 10 == 0)
{
double curr_t = time(NULL);
sum_stream << num_spec_tested << "\t" << curr_t - start_t << "\t" << fixed <<
setprecision(3) << (float)num_top_denovo_correct/spec_idx <<
"\t" << num_top_rerank_correct/(float)spec_idx << "\t(" << spec_idx << ")" << endl;
}
if (num_spec_tested == max_num_spectra)
break;
}
vector<int> b_vals;
b_vals.push_back(1);
b_vals.push_back(2);
b_vals.push_back(5);
b_vals.push_back(10);
b_vals.push_back(20);
b_vals.push_back(50);
b_vals.push_back(100);
b_vals.push_back(200);
b_vals.push_back(500);
b_vals.push_back(1000);
b_vals.push_back(2000);
b_vals.push_back(5000);
b_vals.push_back(10000);
int b=b_vals.size()-1;
while (b>0 && b_vals[b]>=num_solutions)
{
b_vals.pop_back();
b--;
}
b_vals.push_back(num_solutions);
int rep;
for (rep=0; rep<1; rep++)
{
if (rep==0)
{
sum_stream << endl << "De novo correctness results" ;
}
else
{
sum_stream << "Cut correctness results";
}
sum_stream << " for " << fixed << setprecision(0) << mgf_test_file << endl << endl;
vector<int> d_counts,r_counts;
r_counts.resize(b_vals.size()+1,0);
d_counts.resize(b_vals.size()+1,0);
int i;
for (i=0; i<top_denovo[rep].size(); i++)
{
int b;
int d_rank = top_denovo[rep][i];
for (b=0; b<b_vals.size(); b++)
if (d_rank<b_vals[b])
break;
int k;
for (k=d_counts.size()-1; k>=b; k--)
d_counts[k]++;
int r_rank = top_rerank[rep][i];
for (b=0; b<b_vals.size(); b++)
if (r_rank<b_vals[b])
break;
for (k=d_counts.size()-1; k>=b; k--)
r_counts[k]++;
}
// const double total = top_denovo[rep].size();
const double total = all_ssf.size();
sum_stream << endl << "results for " << setprecision(0) << total << " test spectra" << endl;
for (i=0; i<b_vals.size(); i++)
{
sum_stream << setprecision(0) << b_vals[i] << "\t" << setprecision(3) << d_counts[i] << "\t" << (double)d_counts[i]/total << "\t" <<
r_counts[i] << "\t" << (double)r_counts[i]/total << endl;
}
sum_stream << endl;
if (rep == 0)
{
sum_stream << "reg denovo: aa correct " << 100*total_corr_denovo[rep]/total_pred_denovo[rep] << "% ( avg pred " <<
total_pred_denovo[rep] / total << " aa)" << endl;
sum_stream << "rank denovo: aa correct " << 100*total_corr_rerank[rep]/total_pred_rerank[rep]<< "% ( avg pred " <<
total_pred_rerank[rep]/ total << " aa)" << endl;
}
for (i=0; i<b_vals.size(); i++)
sum_stream << b_vals[i] <<"\t";
sum_stream << endl;
for (i=0; i<b_vals.size(); i++)
sum_stream <<(double)d_counts[i]/total <<"\t";
sum_stream << endl;
for (i=0; i<b_vals.size(); i++)
sum_stream <<(double)r_counts[i]/total <<"\t";
sum_stream << endl;
}
if (out_file_stream.is_open())
out_file_stream.close();
}
/****************************************************************************
Runs benchmark on spectra with a complete de novo solution in there
*****************************************************************************/
void make_ranking_examples(AdvancedScoreModel *model,
char *mgf_test_file)
{
const int charge = 2;
Config *config;
FileManager fm;
FileSet fs;
vector<PrmGraph *> prm_ptrs;
PrmGraph opt_prm;
config = model->get_config();
config->set_use_spectrum_charge(2);
const mass_t tolerance = config->get_tolerance();
fm.init_from_file(config,mgf_test_file);
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
BasicSpecReader bsr;
QCPeak peaks[4000];
cout << "Test file : " << mgf_test_file << endl;
double start_t = time(NULL);
double num_pred_aa=0;
int spec_idx;
for (spec_idx=0; spec_idx<all_ssf.size(); spec_idx++)
{
MGF_single *ssf = (MGF_single *)all_ssf[spec_idx];
const string corr_pep_str = ssf->peptide.as_string(config);
const Peptide& full_pep = ssf->peptide;
const mass_t true_mass_with_19 = full_pep.get_mass_with_19();
const int correct_pep_length = ssf->peptide.get_num_aas();
BasicSpectrum bs;
Spectrum s;
// exclude pepitdes with M+16/Q-17
const vector<int>& aas = ssf->peptide.get_amino_acids();
int a;
for (a=0; a<aas.size(); a++)
if (aas[a]>Val)
break;
if (a<aas.size())
continue;
bs.ssf = all_ssf[spec_idx];
bs.peaks = peaks;
bs.num_peaks = bsr.read_basic_spec(config,fm,bs.ssf,bs.peaks);
s.init_from_QCPeaks(config,bs.peaks,bs.num_peaks,ssf);
// create the correct pm_graph, test for presence of optimal solutions
opt_prm.clear();
opt_prm.create_graph_from_spectrum(model,&s,s.get_true_mass_with_19(),s.get_charge());
SeqPath longest = opt_prm.get_longest_subpath(ssf->peptide,0);
// if (longest.get_num_aa()<6)
// continue;
bool has_complete_path = false;
if (longest.get_num_aa() == correct_pep_length)
has_complete_path = true;
// generate de novo solutions
vector<PmcSqsChargeRes> pmc_sqs_res;
vector<mass_t> pms_with_19;
vector<int> charges;
model->select_pms_and_charges(config,bs,pms_with_19,charges,true);
if (prm_ptrs.size()<pms_with_19.size())
prm_ptrs.resize(pms_with_19.size(),NULL);
vector<SeqPath> solutions;
generate_denovo_solutions_from_several_pms(
prm_ptrs,
model,
&s,
true,
400,
6,
14,
pms_with_19,
charges,
solutions,
true);
vector<mass_t> exp_cut_masses;
full_pep.calc_expected_breakage_masses(config, exp_cut_masses);
// if (solutions.size()>0)
// num_pred_aa += solutions[0].get_num_aa();
int min_denovo_correct_rank=NEG_INF;
int min_denovo_cut_correct_rank=POS_INF;
int s_idx;
bool has_correct_path_in_results=false;
for (s_idx=0; s_idx<solutions.size(); s_idx++)
{
if (solutions[s_idx].check_if_correct(corr_pep_str,config))
{
if (min_denovo_correct_rank<0)
min_denovo_correct_rank=s_idx;
solutions[s_idx].is_correct = true;
}
else
solutions[s_idx].is_correct = false;
if (solutions[s_idx].check_if_cut_correct(exp_cut_masses, tolerance))
{
if (min_denovo_cut_correct_rank == POS_INF)
min_denovo_cut_correct_rank=s_idx;
solutions[s_idx].is_cut_correct = true;
}
else
solutions[s_idx].is_correct = false;
solutions[s_idx].org_rank = s_idx;
}
// only include spectra that have a good path in the top xxx
// if (min_denovo_correct_rank==NEG_INF)
// continue;
// if the correct path is not in the solutions, look for one in the prm graphs and add it
if (min_denovo_correct_rank==NEG_INF)
{
cout << endl << spec_idx << " skipped..." << endl;
continue;
}
DeNovoRankScorer *drs = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
vector<score_pair> scores;
drs->score_denovo_sequences(solutions,ssf,peaks,bs.num_peaks,scores,-1);
sort(scores.begin(),scores.end());
int min_rerank_correct=NEG_INF;
int min_rerank_cut_correct=POS_INF;
int sol_rank=-1;
int j;
for (j=0; j<scores.size(); j++)
{
int org_rank = scores[j].idx;
if (min_rerank_correct<0 && solutions[org_rank].is_correct)
{
min_rerank_correct = j;
sol_rank = org_rank;
}
}
if (min_rerank_correct == 0 && min_denovo_correct_rank>0)
{
vector<SeqPath> example_paths;
example_paths.push_back(solutions[0]);
example_paths.push_back(solutions[sol_rank]);
cout << endl;
bs.ssf->print_ssf_stats(config);
cout << "Denovo rank 0 vs. rerank 0 ( original rank " << sol_rank << ")" << endl << endl;
vector<int> aas;
solutions[0].get_amino_acids(aas);
Peptide p;
p.set_peptide_aas(aas);
cout << "0\tdnv : " << p.as_string(config) << endl;
cout << sol_rank << "\trnk : " << bs.ssf->peptide.as_string(config) << endl;
drs->list_feature_differences(example_paths,bs.ssf,peaks,bs.num_peaks);
cout << endl;
}
}
}
void DeNovoRankScorer::give_de_novo_and_peak_match_examples(
const string& db_dir,
const string& correct_dir,
const string& denovo_dir,
const string& mgf_list,
const int charge,
const int size_idx)
{
vector<bool> file_indicators;
PeptideSetMap psm;
PeakRankModel *&peak_model = peak_prediction_models[model_type];
const vector< vector<mass_t> >& size_thresholds = peak_model->get_size_thresholds();
Config *config = model->get_config();
if (dnv_part_models.size()<=charge)
init_tables();
if (! dnv_part_models[charge][size_idx])
dnv_part_models[charge][size_idx] = new DeNovoPartitionModel;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -