📄 denovorankscore.cpp
字号:
{
cout << "Error: peak model not initialized!" << endl;
exit(1);
}
if (! comp_assigner)
{
cout << "Error: comp assigner not initialized!" << endl;
exit(1);
}
string dir = model->get_config()->get_resource_dir();
string base_path = dir + "/" + name;
string model_file_name = base_path + "_model.txt";
ofstream ofs(model_file_name.c_str());
if (! ofs.is_open() || ! ofs.good())
{
cout << "Error: couldn't open file for reading: " << model_file_name << endl;
exit(1);
}
ofs << name << " " << model_type << endl;
ofs << model->get_max_score_model_charge() << endl;
ofs << model->get_model_name() << endl;
ofs << peak_prediction_models[model_type]->get_peak_rank_model_name() << endl;
ofs << comp_assigner->get_model_name() << endl;
int c;
for (c=0; c<this->dnv_part_models.size(); c++)
{
int size_idx;
for (size_idx=0; size_idx<dnv_part_models[c].size(); size_idx++)
{
if (dnv_part_models[c][size_idx] &&
dnv_part_models[c][size_idx]->ind_was_initialized)
{
ostringstream oss;
oss << base_path << "_" << c << "_" << size_idx << ".txt";
string path = oss.str();
dnv_part_models[c][size_idx]->write_denovo_partition_model(this->model_type, path.c_str());
}
}
}
}
/*********************************************************************
Adds the counts for peaks around a breakage to their respective bins
**********************************************************************/
void add_offset_counts_for_unannotated_peaks_arround_mass(vector<int>& counts,
AnnotatedSpectrum *spec,
mass_t min_mass,
mass_t max_mass,
mass_t bin_coef,
mass_t break_mass,
int charge)
{
const vector< vector<PeakAnnotation> >& peak_anns = spec->get_peak_annotations();
const mass_t low_range = (break_mass + min_mass+1)/(mass_t)charge;
const mass_t high_range = (break_mass + max_mass)/(mass_t)charge;
const PeakRange pr = spec->get_peaks_in_range(low_range, high_range);
// cout << spec->get_peptide().as_string(spec->get_config()) << " " << break_mass << endl;
if (pr.num_peaks<=0)
return;
int skipped=0;
// add counts
int p_idx;
for (p_idx = pr.low_idx; p_idx<=pr.high_idx; p_idx++)
{
if (spec->get_peak_iso_level(p_idx)>0)
continue;
if (peak_anns[p_idx].size()>0)
{
skipped++;
continue;
}
const mass_t peak_mass = spec->get_peak_mass(p_idx);
const mass_t b_mass = break_mass/(mass_t)charge;
const int bin = (int)((peak_mass - b_mass - min_mass)*bin_coef);
counts[bin]+=10;
counts[bin-1]+=9;
counts[bin+1]+=9;
counts[bin-2]+=6;
counts[bin+2]+=6;
counts[bin-3]+=4;
counts[bin+3]+=4;
counts[bin-4]+=2;
counts[bin+4]+=2;
}
}
/***********************************************************************
Uses the offset count method to determine if there are any fragments
that are special for a given PTM (function only prints a list of
the most interesting cases).
************************************************************************/
void find_special_PTM_frags_using_offset_counts(
const string& PTM_label,
FileManager& fm,
const vector<SingleSpectrumFile *>& all_ssfs,
Model *model,
int max_charge)
{
Config *config = model->get_config();
const mass_t min_offset_mass = -120;
const mass_t max_offset_mass = 120;
const mass_t tolerance = config->get_tolerance();
const mass_t bin_size = tolerance * 0.1;
const mass_t bin_coef = 1.0 / bin_size;
const int count_size = (int)((max_offset_mass - min_offset_mass + 1) / bin_size);
vector< vector< vector<int> > > prefix_counts, suffix_counts; // charge, distance from cut, bin_idx
vector< vector< int > > pre_instances, suf_instances;
int i,c,d;
float min_frag_prob = 0.02;
const int ptm_aa = config->get_aa_from_label(PTM_label);
if (ptm_aa <0)
{
cout << "Error: PTM not supported in this model: " << PTM_label << endl;
exit(1);
}
const int max_distance = 1;
prefix_counts.resize(max_charge+1);
suffix_counts.resize(max_charge+1);
pre_instances.resize(max_charge+1);
suf_instances.resize(max_charge+1);
for (c=1; c<=max_charge; c++)
{
prefix_counts[c].resize(max_distance+1);
suffix_counts[c].resize(max_distance+1);
pre_instances[c].resize(max_distance+1,0);
suf_instances[c].resize(max_distance+1,0);
for (d=0; d<=max_distance; d++)
{
prefix_counts[c][d].resize(count_size,0);
suffix_counts[c][d].resize(count_size,0);
}
}
vector<QCPeak> peaks;
BasicSpecReader bsr;
peaks.resize(5000);
int spectra_used=0;
for (i=0; i<all_ssfs.size(); i++)
{
AnnotatedSpectrum as;
vector<mass_t> break_masses;
const Peptide& pep = all_ssfs[i]->peptide;
const vector<int>& amino_acids = pep.get_amino_acids();
int aa_idx;
for (aa_idx = 0; aa_idx<amino_acids.size(); aa_idx++)
if (amino_acids[aa_idx] == ptm_aa)
break;
if (aa_idx == amino_acids.size())
continue;
int num_peaks = bsr.read_basic_spec(config,fm,all_ssfs[i],&peaks[0]);
as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssfs[i]);
as.set_peptide(pep);
as.annotate_spectrum(pep.get_mass_with_19(),true);
spectra_used++;
pep.calc_expected_breakage_masses(config,break_masses);
const mass_t true_mass = as.get_peptide().get_mass();
for (aa_idx=0; aa_idx<amino_acids.size(); aa_idx++)
{
if (amino_acids[aa_idx] != ptm_aa)
continue;
int d;
for (d=0; d<=max_distance; d++)
{
int cut_idx = aa_idx + 1 -d;
if (cut_idx<1)
break;
int c;
const mass_t break_mass = break_masses[cut_idx];
for (c=1; c<=max_charge; c++)
{
// cout << "P: " << c << " " << d << " ";
add_offset_counts_for_unannotated_peaks_arround_mass(prefix_counts[c][d], &as,
min_offset_mass, max_offset_mass, bin_coef, break_mass,c);
pre_instances[c][d]++;
}
}
for (d=0; d<=max_distance; d++)
{
int cut_idx = aa_idx + d;
if (cut_idx>amino_acids.size())
break;
int c;
const mass_t break_mass = break_masses[cut_idx];
for (c=1; c<=max_charge; c++)
{
// cout << "S: " << c << " " << d << " ";
add_offset_counts_for_unannotated_peaks_arround_mass(suffix_counts[c][d], &as,
min_offset_mass, max_offset_mass, bin_coef, (true_mass - break_mass),c);
suf_instances[c][d]++;
}
}
}
}
cout << "Using: " << spectra_used << " spectra for offset counts..." << endl;
// select 30 top fragments becasue many are likely to be caused by previous/next
// amino acids and will be later removed
vector<FragmentTypeSet> pre_fts , suf_fts;
pre_fts.resize(max_distance+1);
suf_fts.resize(max_distance+1);
for (c=1; c<=max_charge; c++)
{
int d;
for (d=0; d<=max_distance; d++)
{
select_fragments_from_bins(prefix_counts[c][d],pre_fts[d],30,c,PREFIX,min_offset_mass,bin_coef,tolerance);
select_fragments_from_bins(suffix_counts[c][d],suf_fts[d],30,c,SUFFIX,min_offset_mass,bin_coef,tolerance);
/* int f;
for (f=0; f<pre_fts[d].get_num_fragments(); f++)
{
FragmentType& frag = pre_fts[d].get_non_const_fragment(f);
frag.prob = frag.prob / (float)pre_instances[frag.charge][d];
}
pre_fts[d].sort_fragments_according_to_probs();
for (f=0; f<suf_fts[d].get_num_fragments(); f++)
{
FragmentType& frag = suf_fts[d].get_non_const_fragment(f);
frag.prob = frag.prob / (float)suf_instances[frag.charge][d];
}
suf_fts[d].sort_fragments_according_to_probs();*/
}
}
for (d=0; d<=max_distance; d++)
{
// calculate_true_fragment_probabilities(fm,config,fts[d], min_frag_prob);
cout << "Fragments selected from spectra [distance " << d << "]:" << endl;
for (i=0; i<pre_fts[d].get_num_fragments() && i<5; i++)
{
const FragmentType& frag = pre_fts[d].get_fragment(i);
cout << left << setw(3) << i << right << setw(5) << frag.spec_count << " ";
cout << setw(6) << setprecision(3) << right << frag.prob << " ";
frag.write_fragment(cout);
}
cout << endl;
for (i=0; i<suf_fts[d].get_num_fragments() && i<5; i++)
{
const FragmentType& frag = suf_fts[d].get_fragment(i);
cout << left << setw(3) << i << right << setw(5) << frag.spec_count << " ";
cout << setw(6) << setprecision(3) << right << frag.prob << " ";
frag.write_fragment(cout);
}
cout << endl;
}
}
/****************************************************************************
Runs benchmark on spectra with a complete de novo solution in there
*****************************************************************************/
void benchmark_ranking_on_full_denovo(AdvancedScoreModel *model,
char *mgf_test_file,
int max_num_spectra,
int num_solutions,
char *report_path,
int min_length,
int max_length)
{
const int charge = 2;
Config *config;
char report_file_path[512];
char *report_name=NULL;
ofstream out_file_stream;
if (report_path)
{
sprintf(report_file_path,"%s_bench_test.txt",report_path);
report_name = report_file_path;
out_file_stream.open(report_file_path);
cout << "Writing results to " << report_file_path << endl;
}
else
cout << "Writing results to cout..." << endl;
ostream& sum_stream = (report_path ? out_file_stream : cout);
FileManager fm;
FileSet fs;
vector<PrmGraph *> prm_ptrs;
PrmGraph opt_prm;
config = model->get_config();
config->set_use_spectrum_charge(2);
const mass_t tolerance = config->get_tolerance();
fm.init_from_file(config,mgf_test_file);
fs.select_all_files(fm);
fs.randomly_reduce_ssfs((int)(max_num_spectra * 1.5));
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
// 0 all seqs , 1 complete seqs, 2 all cuts ,3 complete cuts
vector< vector<int> > top_rerank, top_denovo;
vector<double> total_corr_denovo, total_pred_denovo, total_corr_rerank, total_pred_rerank;
top_rerank.resize(4);
top_denovo.resize(4);
total_corr_denovo.resize(4,0);
total_pred_denovo.resize(4,0);
total_corr_rerank.resize(4,0);
total_pred_rerank.resize(4,0);
BasicSpecReader bsr;
QCPeak peaks[4000];
int num_spec_tested=0;
int num_top_denovo_correct=0;
int num_top_rerank_correct=0;
sum_stream << "Test file : " << mgf_test_file << endl;
sum_stream << "Num cases : " << max_num_spectra << endl;
sum_stream << "Num solutions : " << num_solutions << endl << endl;
sum_stream << "START TESTING..." << endl;
double start_t = time(NULL);
double num_pred_aa=0;
int spec_idx;
for (spec_idx=0; spec_idx<all_ssf.size(); spec_idx++)
{
MGF_single *ssf = (MGF_single *)all_ssf[spec_idx];
const string corr_pep_str = ssf->peptide.as_string(config);
const Peptide& full_pep = ssf->peptide;
const mass_t true_mass_with_19 = full_pep.get_mass_with_19();
const int correct_pep_length = ssf->peptide.get_num_aas();
BasicSpectrum bs;
Spectrum s;
// exclude pepitdes with M+16/Q-17
const vector<int>& aas = ssf->peptide.get_amino_acids();
int a;
for (a=0; a<aas.size(); a++)
if (aas[a]>Val)
break;
if (a<aas.size())
continue;
bs.ssf = all_ssf[spec_idx];
bs.peaks = peaks;
bs.num_peaks = bsr.read_basic_spec(config,fm,bs.ssf,bs.peaks);
s.init_from_QCPeaks(config,bs.peaks,bs.num_peaks,ssf);
// create the correct pm_graph, test for presence of optimal solutions
opt_prm.clear();
opt_prm.create_graph_from_spectrum(model,&s,s.get_true_mass_with_19(),s.get_charge());
SeqPath longest = opt_prm.get_longest_subpath(ssf->peptide,0);
// if (longest.get_num_aa()<6)
// continue;
bool has_complete_path = false;
if (longest.get_num_aa() == correct_pep_length)
has_complete_path = true;
// generate de novo solutions
vector<PmcSqsChargeRes> pmc_sqs_res;
vector<mass_t> pms_with_19;
vector<int> charges;
model->select_pms_and_charges(config,bs,pms_with_19,charges,true);
if (prm_ptrs.size()<pms_with_19.size())
prm_ptrs.resize(pms_with_19.size(),NULL);
vector<SeqPath> solutions;
generate_denovo_solutions_from_several_pms(
prm_ptrs,
model,
&s,
true,
num_solutions,
6,
14,
pms_with_19,
charges,
solutions,
false);
vector<mass_t> exp_cut_masses;
full_pep.calc_expected_breakage_masses(config, exp_cut_masses);
// if (solutions.size()>0)
// num_pred_aa += solutions[0].get_num_aa();
int min_denovo_correct_rank=NEG_INF;
int min_denovo_cut_correct_rank=POS_INF;
int s_idx;
bool has_correct_path_in_results=false;
for (s_idx=0; s_idx<solutions.size(); s_idx++)
{
if (solutions[s_idx].check_if_correct(corr_pep_str,config))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -