📄 denovosolutions.cpp
字号:
#include "DeNovoSolutions.h"
#include "DeNovoRankScore.h"
#include "auxfun.h"
mass_t SeqPathKey::tolerance;
int num_messages=0;
/**************************************************************************
Adds a new seqpath to the existing vector.
If vector is at maximal size, can replace the lowest confidence path.
If a path exists with the same seq and n,c masses but lower confidence
score, replaces it.
return values:
0 - score not high enough to be intered in heap
1 - solution already exists with higher score
2 - added path with no need to remove something in the heap
3 - added path but removed a different one from the heap
***************************************************************************/
int SeqPathHeap::add_path(SeqPath& new_path, bool verbose)
{
if (new_path.sort_key<min_value && paths.size() >= max_size)
return 0;
SeqPathKey new_key(new_path);
set<SeqPathKey>::iterator it=path_keys.find(new_key);
if (it != path_keys.end())
{
if (verbose)
{
cout << "Same: " << new_path.seq_str << " " << new_path.n_term_mass << " " << new_path.pm_with_19 << " " << new_path.path_score << " " << new_path.seq_prob << endl;
cout << "Same: " << it->pep_str << " " << it->n_mass << " " << it->sort_key << endl;
}
if (it->sort_key < new_path.sort_key)
{
if (verbose)
{
cout << "Replaced " << setprecision(6) << it->sort_key << " with " << new_path.sort_key << endl;
cout << it->n_mass << " : " << new_path.n_term_mass << endl;
cout << it->pep_str << " : " << new_path.seq_str << endl;
}
new_key.path_pos_idx = it->path_pos_idx;
paths[it->path_pos_idx]=new_path;
path_keys.erase(it);
path_keys.insert(new_key);
}
// no update of the heap value
return 1;
}
// add path, no need to remove
if (paths.size() < max_size)
{
new_key.path_pos_idx = paths.size();
paths.push_back(new_path);
path_keys.insert(new_key);
min_score_heap.push_back(idx_score_pair(new_key));
push_heap(min_score_heap.begin(),min_score_heap.end());
min_idx = min_score_heap[0].path_pos_idx;
min_value = min_score_heap[0].sort_key;
return 2;
}
// remove lowest score and then add
new_key.path_pos_idx = min_idx;
SeqPathKey remove_key(paths[min_idx]);
it=path_keys.find(remove_key);
if (it == path_keys.end())
{
// if (num_messages++<10)
// cout << "Warning: couldn't find remove key!" << endl;
// return;
}
paths[min_idx] = new_path;
if (it != path_keys.end())
path_keys.erase(it);
path_keys.insert(new_key);
pop_heap(min_score_heap.begin(),min_score_heap.end());
min_score_heap.pop_back();
min_score_heap.push_back(idx_score_pair(new_key));
push_heap(min_score_heap.begin(),min_score_heap.end());
min_idx = min_score_heap[0].path_pos_idx;
min_value = min_score_heap[0].sort_key;
return 3;
}
void print_prm_graph_scores(Model *model, Spectrum *spec,
mass_t pm_with_19, int charge)
{
PrmGraph prm;
Config *config= model->get_config();
if (charge> model->get_max_score_model_charge() || charge<1)
{
cout << "Charge " << charge << " not supported..." << endl;
return;
}
spec->set_charge(charge);
model->init_model_for_scoring_spectrum(spec);
mass_t opt_pm_with_19 = prm.find_optimal_pm_with_19_for_graph(model,spec,pm_with_19,charge);
spec->set_corrected_pm_with_19(opt_pm_with_19);
if (! strcmp(model->get_model_name().c_str(),"ETD"))
{
vector<string> labels;
labels.push_back("c");
labels.push_back("z");
spec->print_expected_fragment_peaks(labels,cout);
}
else
spec->print_expected_by();
prm.create_graph_from_spectrum(model,spec,opt_pm_with_19);
model->score_graph_edges(prm);
prm.add_edge_scores_to_nodes();
prm.print_only_scores();
cout << endl;
}
/**************************************************************************
Wrapper funciton that generates the desired solutions.
Combines both local and global solutions (similar to the PepNovoTag
and LocalTag solutions).
***************************************************************************/
bool generate_denovo_solutions(PrmGraph*& prm,
Model *model,
Spectrum *spec,
bool denovo_mode,
mass_t pm_with_19,
int charge,
int num_sols,
int min_length,
int max_length,
score_t min_score_needed,
vector<SeqPath>& solutions,
bool only_complete,
bool only_from_graph_containing_true_pep,
bool need_to_create_PrmGraph)
{
DeNovoDp dndp;
Config *config= model->get_config();
const score_t forbidden_pair_penalty = config->get_forbidden_pair_penalty();
// shorten the generated sequences to improve the accuracy
if (max_length>10)
{
if (pm_with_19>1800)
{
if (min_length>=10)
{
min_length=8;
max_length=14;
}
else
max_length=14;
}
if (pm_with_19>2200)
{
if (min_length>=10)
{
min_length=8;
max_length=12;
}
else
max_length=12;
}
}
if (! prm)
prm = new PrmGraph;
if (need_to_create_PrmGraph)
{
spec->set_charge(charge);
model->init_model_for_scoring_spectrum(spec);
prm->create_graph_from_spectrum(model,spec,pm_with_19,charge);
model->score_graph_edges(*prm);
}
// prm->print_with_multi_edges();
// exit(0);
if (only_from_graph_containing_true_pep)
{
const Peptide& true_pep = spec->get_peptide();
if (true_pep.get_num_aas()<3)
{
cout << "Error: looking for graphs containing peptide path, but no peptide was given!" << endl;
exit(1);
}
SeqPath best_path = prm->get_longest_subpath(true_pep,0);
if (best_path.get_num_aa() < true_pep.get_num_aas())
return false;
}
dndp.fill_dp_table(prm,config->get_forbidden_pair_penalty());
SeqPath best = prm->get_longest_subpath(spec->get_peptide(),0);
// best.print_full(config);
if (1)
{
SeqPathHeap denovo_path_heap;
vector<SeqPath> seq_paths;
vector<MultiPath> multi_paths;
// generate global tags from parsing de novo paths
denovo_path_heap.init(num_sols,config->get_tolerance());
int num_multi_paths=num_sols;
if (num_sols<2000)
num_multi_paths=2000;
dndp.get_top_scoring_antisymetric_paths_with_length_limits(multi_paths,
num_multi_paths, min_length , max_length, forbidden_pair_penalty, min_score_needed,
denovo_mode, only_complete);
prm->expand_all_multi_paths(model, multi_paths, seq_paths, forbidden_pair_penalty, num_sols);
int i;
for (i=0; i<seq_paths.size(); i++)
{
seq_paths[i].pm_with_19 = pm_with_19;
seq_paths[i].charge = charge;
seq_paths[i].make_seq_str(config);
seq_paths[i].sort_key = seq_paths[i].path_score;
// if peptide is whole, make sure its mass is within the pm tolerance
if (seq_paths[i].n_term_mass<1.0 && seq_paths[i].c_term_mass + 30.0 > seq_paths[i].pm_with_19)
{
const mass_t pep_mass = seq_paths[i].calculate_peptide_mass(config) + MASS_OHHH;
const mass_t delta = fabs(pep_mass - spec->get_org_pm_with_19());
// cout << i << "\t" << pep_mass << "\t" << spec->get_org_pm_with_19() << "\t" << delta << endl;
// if (delta > config->get_pm_tolerance())
// continue;
}
denovo_path_heap.add_path(seq_paths[i]);
}
sort(denovo_path_heap.paths.begin(),denovo_path_heap.paths.end(),comp_SeqPath_path_score);
solutions = denovo_path_heap.paths;
for (i=0; i<solutions.size(); i++)
{
solutions[i].delta_score = solutions[0].path_score - solutions[i].path_score;
solutions[i].make_seq_str(config);
}
}
// cout << "SOLS: " << solutions.size() << " " << min_score_needed << endl;
return true;
}
/**************************************************************************
Wrapper funciton that generates the desired solutions.
Combines both local and global solutions (similar to the PepNovoTag
and LocalTag solutions).
***************************************************************************/
bool generate_denovo_solutions_with_good_start_end_idxs(
PrmGraph*& prm,
Model *model,
Spectrum *spec,
bool denovo_mode,
mass_t pm_with_19,
int charge,
int num_sols,
int min_length,
int max_length,
vector<SeqPath>& solutions)
{
DeNovoDp dndp;
Config *config= model->get_config();
const score_t forbidden_pair_penalty = config->get_forbidden_pair_penalty();
if (charge>3 || charge<1)
{
solutions.clear();
return false;
}
if (! prm)
prm = new PrmGraph;
spec->set_charge(charge);
model->init_model_for_scoring_spectrum(spec);
prm->create_graph_from_spectrum(model,spec,pm_with_19);
model->score_graph_edges(*prm);
dndp.fill_dp_table(prm,forbidden_pair_penalty);
// find which nodes are good start/end points
const vector<Node>& nodes = prm->get_nodes();
vector<bool> good_idx_inds;
good_idx_inds.resize(nodes.size(),false);
int m_idx=0,n_idx=0;
vector<mass_t> good_masses;
spec->get_peptide().calc_expected_breakage_masses(config,good_masses);
const int num_nodes = nodes.size(), num_masses = good_masses.size();
good_idx_inds[0]=true;
good_idx_inds[num_nodes-1]=true;
while (n_idx<num_nodes && m_idx<num_masses)
{
const mass_t m_mass = good_masses[m_idx];
const mass_t n_mass = nodes[n_idx].mass;
if (fabs(m_mass-n_mass)<1.25)
good_idx_inds[n_idx]=true;
if (m_mass<n_mass-2)
{
m_idx++;
}
else
n_idx++;
}
SeqPathHeap denovo_path_heap;
vector<MultiPath> multi_paths;
// generate global tags from parsing de novo paths
denovo_path_heap.init(num_sols,config->get_tolerance());
dndp.get_top_scoring_antisymetric_paths_with_specified_start_end_idxs(
good_idx_inds, multi_paths, num_sols, min_length, max_length, config->get_forbidden_pair_penalty());
vector<SeqPath> seq_paths;
prm->expand_all_multi_paths(model, multi_paths,seq_paths, forbidden_pair_penalty, num_sols);
// parse seq paths
int i;
for (i=0; i<seq_paths.size(); i++)
{
seq_paths[i].sort_key = seq_paths[i].path_score;
denovo_path_heap.add_path(seq_paths[i]);
}
sort(denovo_path_heap.paths.begin(),denovo_path_heap.paths.end(),comp_SeqPath_path_score);
solutions = denovo_path_heap.paths;
return true;
}
/***************************************************************************
Wrapper function that generates several solutions according to different
precursor masses.
****************************************************************************/
void generate_denovo_solutions_from_several_pms(vector<PrmGraph *>& prm_ptrs,
AdvancedScoreModel *model,
Spectrum *spec,
bool ind_denovo_mode,
int num_sols,
int min_length,
int max_length,
vector<mass_t>& different_pms_with_19,
vector<int>& charges,
vector<SeqPath>& best_solutions,
bool ind_only_complete)
{
static SeqPathHeap seq_heap;
best_solutions.clear();
seq_heap.init(num_sols, model->get_config()->get_tolerance());
int i;
for (i=0; i<different_pms_with_19.size(); i++)
{
const score_t min_score_needed = ( (! ind_denovo_mode || i==0 || seq_heap.min_score_heap.size()<num_sols) ?
NEG_INF : seq_heap.min_value );
vector<SeqPath> sols;
generate_denovo_solutions(prm_ptrs[i], model,spec, ind_denovo_mode,
different_pms_with_19[i],charges[i],num_sols,min_length, max_length,
min_score_needed, sols, ind_only_complete);
// removes duplicated
int j;
for (j=0; j<sols.size(); j++)
{
seq_heap.add_path(sols[j]);
}
// impose time limit
}
best_solutions = seq_heap.paths;
if (ind_denovo_mode)
{
sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_path_score);
}
else
sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_sort_key);
for (i=0; i<best_solutions.size(); i++)
best_solutions[i].org_rank=i;
}
void generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(
vector<PrmGraph *>& prm_ptrs,
AdvancedScoreModel *model,
Spectrum *spec,
bool ind_denovo_mode,
int num_sols, int min_length, int max_length,
vector<mass_t>& different_pms_with_19,
vector<int>& charges,
vector<SeqPath>& best_solutions)
{
static SeqPathHeap seq_heap;
best_solutions.clear();
seq_heap.init(num_sols, model->get_config()->get_tolerance());
int i;
for (i=0; i<different_pms_with_19.size(); i++)
{
vector<SeqPath> sols;
generate_denovo_solutions_with_good_start_end_idxs(prm_ptrs[i], model,spec, ind_denovo_mode,
different_pms_with_19[i],charges[i],num_sols,min_length,max_length,sols);
// removes duplicated
int j;
for (j=0; j<sols.size(); j++)
seq_heap.add_path(sols[j]);
}
best_solutions = seq_heap.paths;
if (ind_denovo_mode)
{
sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_path_score);
}
else
sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_sort_key);
for (i=0; i<best_solutions.size(); i++)
best_solutions[i].org_rank=i;
}
/***************************************************************************
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -