📄 multipath.cpp
字号:
#include "PrmGraph.h"
struct pos_score_pair {
pos_score_pair() : pos(NEG_INF), score(NEG_INF) {};
bool operator< (const pos_score_pair& other) const
{
return (score > other.score);
}
int pos;
score_t score;
};
// This struct uses a simple score based comparison
struct PathHeap {
public:
void init(int size)
{
heap_size = size;
paths.resize(heap_size);
score_pairs.resize(heap_size);
int i;
for (i=0; i<heap_size; i++)
{
score_pairs[i].pos = i;
score_pairs[i].score = NEG_INF;
}
}
score_t get_min_score() const { return score_pairs[0].score; }
int get_num_real_entries() const
{
int i,n=0;
for (i=0; i<score_pairs.size(); i++)
if (score_pairs[i].score>NEG_INF)
n++;
return n;
}
void add_path(const SeqPath& path)
{
if (path.path_score<=score_pairs[0].score)
return;
const int pos = score_pairs[0].pos;
paths[pos]=path;
pop_heap(score_pairs.begin(),score_pairs.end());
score_pairs[heap_size-1].pos = pos;
score_pairs[heap_size-1].score = path.path_score;
push_heap(score_pairs.begin(),score_pairs.end());
}
void sort_paths() { sort(paths.begin(),paths.end(),comp_SeqPath_path_score); }
vector<SeqPath> get_paths() { return paths; }
private:
int heap_size;
vector<pos_score_pair> score_pairs;
vector<SeqPath> paths; // this is a min heap smallest score in front!
};
// add the variant ptr and scores for this combo
void PrmGraph::add_and_score_edge_variants(const AA_combo& aa_combo, MultiEdge& edge)
{
const vector<int>& aa_positions = config->get_aa_positions();
const bool reaches_n_term = (nodes[edge.n_idx].type == NODE_N_TERM);
const bool reaches_c_term = (nodes[edge.c_idx].type == NODE_C_TERM);
int v;
int *variant_ptr = (int *)config->get_variant_ptr(aa_combo.variant_start_idx);
for (v=0; v<aa_combo.num_variants; v++)
{
int num_aa = *variant_ptr;
int *aas = variant_ptr+1;
int i;
for (i=0; i<num_aa; i++)
if (aa_positions[*(aas+i)])
break;
// need to check that the variant is not violating any of the position restrictions
// such as +1 -1 positions
if (i<num_aa)
{
if (reaches_n_term)
{
int a;
for (a=0; a<num_aa; a++)
{
int aa_idx = aas[a];
if (aa_positions[aa_idx] != 0 && aa_positions[aa_idx] != a+1)
break;
}
if (a<num_aa)
{
variant_ptr+= num_aa +1;
continue;
}
}
else // check for +1 positions
{
int a;
for (a=0; a<num_aa; a++)
{
int aa_idx = aas[a];
if (aa_positions[aa_idx] == 1)
break;
}
if (a<num_aa)
{
variant_ptr+= num_aa +1;
continue;
}
}
if (reaches_c_term)
{
int a;
for (a=0; a<num_aa; a++)
{
int aa_idx = aas[a];
if (aa_positions[aa_idx] != 0 && aa_positions[aa_idx] != a-num_aa )
break;
}
if (a<num_aa) // found a problem with one of the aa positions
{
variant_ptr+= num_aa +1;
continue;
}
}
else // check for -1 positions
{
int a;
for (a=0; a<num_aa; a++)
{
int aa_idx = aas[a];
if (aa_positions[aa_idx] == -1)
break;
}
if (a<num_aa)
{
variant_ptr+= num_aa +1;
continue;
}
}
}
score_t variant_score = calc_edge_variant_score(edge,num_aa,aas);
if (variant_score>edge.max_variant_score)
edge.max_variant_score = variant_score;
edge.variant_ptrs.push_back(variant_ptr);
edge.variant_scores.push_back(variant_score);
edge.variant_probs.push_back(0);
variant_ptr+= num_aa +1;
}
}
// adds the relevant PathPos to the path and adjusts the other non-terminal values
void SeqPath::add_edge_variant(const MultiEdge& edge, int e_idx, int variant_idx)
{
PathPos new_pos;
int *variant_ptr = edge.variant_ptrs[variant_idx];
int num_aa = *variant_ptr++;
int *aas = variant_ptr;
if (num_aa != edge.num_aa)
{
cout << "Error: edge and variant mixup!" << endl;
exit(1);
}
num_aa += edge.num_aa;
new_pos.breakage = edge.n_break;
new_pos.edge_idx = e_idx;
new_pos.mass = edge.n_break->mass;
new_pos.edge_variant_score = edge.variant_scores[variant_idx];
new_pos.node_score = edge.n_break->score;
new_pos.node_idx = edge.n_idx;
new_pos.aa = aas[0];
path_score += new_pos.edge_variant_score + new_pos.node_score;
positions.push_back(new_pos);
if (edge.num_aa == 1)
return;
int i;
for (i=1; i<edge.num_aa; i++)
{
PathPos new_pos;
new_pos.breakage = NULL;
new_pos.aa = aas[i]; // the rest of the fields are initialized to the default (NULL) values
positions.push_back(new_pos);
}
}
/************************************************************************
Expands the single multi path into all possible sequence variants.
This is done incrementaly by expanding each multi edges
*************************************************************************/
void PrmGraph::expand_multi_path(const MultiPath& multi_path,
vector<SeqPath>& seq_paths,
score_t min_score,
score_t forbidden_pair_penalty,
int max_num_paths) const
{
const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
vector<score_t> max_attainable_scores;
const int num_edges = multi_path.edge_idxs.size();
max_attainable_scores.resize(num_edges+1,0);
int e;
for (e=num_edges-1; e>=0; e--)
{
const int e_idx = multi_path.edge_idxs[e];
const MultiEdge& edge = multi_edges[e_idx];
max_attainable_scores[e]= max_attainable_scores[e+1] +
edge.max_variant_score + edge.c_break->score;
}
if (min_score>multi_edges[multi_path.edge_idxs[0]].n_break->score + max_attainable_scores[0])
return;
int i;
seq_paths.resize(1);
seq_paths[0].n_term_aa = multi_path.n_term_aa;
seq_paths[0].c_term_aa = multi_path.c_term_aa;
seq_paths[0].n_term_mass = multi_path.n_term_mass;
seq_paths[0].c_term_mass = multi_path.c_term_mass;
seq_paths[0].multi_path_rank = multi_path.original_rank;
seq_paths[0].path_score = 0;
seq_paths[0].prm_ptr = (PrmGraph *)this;
seq_paths[0].positions.clear();
for (e=0; e<multi_path.edge_idxs.size(); e++)
{
const int e_idx = multi_path.edge_idxs[e];
const MultiEdge& edge = multi_edges[e_idx];
if (edge.get_num_variants() == 1)
{
const int * variant_ptr = edge.variant_ptrs[0];
const int num_aa = *variant_ptr++;
const int *aas = variant_ptr;
int i;
for (i=0; i<seq_paths.size(); i++)
{
if (seq_paths[i].path_score>NEG_INF &&
seq_paths[i].path_score + max_attainable_scores[e] > min_score)
{
seq_paths[i].add_edge_variant(edge,e_idx,0);
}
else
seq_paths[i].path_score = NEG_INF;
}
}
else
{
vector<SeqPath> old_paths = seq_paths;
seq_paths.resize(seq_paths.size()*edge.get_num_variants());
int v;
int idx=0;
for (v=0; v<edge.get_num_variants(); v++)
{
int i;
for (i=0; i<old_paths.size(); i++)
seq_paths[idx++]=old_paths[i];
}
idx=0;
for (v=0; v<edge.get_num_variants(); v++)
{
const int * variant_ptr = edge.variant_ptrs[v];
const int num_aa = *variant_ptr++;
const int *aas = variant_ptr;
int i;
for (i=0; i<old_paths.size(); i++)
{
if (seq_paths[idx].path_score>NEG_INF &&
seq_paths[idx].path_score + max_attainable_scores[e] > min_score)
{
seq_paths[idx].add_edge_variant(edge,e_idx,v);
}
else
seq_paths[idx].path_score=NEG_INF;
idx++;
}
}
}
// if too many paths are here sort and pop back
if (max_num_paths>0 && seq_paths.size()>max_num_paths)
{
sort(seq_paths.begin(),seq_paths.end(),comp_SeqPath_path_score);
while (seq_paths.size()>max_num_paths && seq_paths[seq_paths.size()-1].path_score == NEG_INF)
seq_paths.pop_back();
}
else // remove all seq paths with NEG_INF score
{
int last_idx = seq_paths.size()-1;
int i;
for (i=0; i<last_idx; i++)
{
while (last_idx>i && seq_paths[last_idx].path_score == NEG_INF)
last_idx--;
if (seq_paths[i].path_score == NEG_INF)
{
seq_paths[i]=seq_paths[last_idx];
seq_paths[last_idx].path_score = NEG_INF;
}
}
while (seq_paths.size()>0 && seq_paths[seq_paths.size()-1].path_score == NEG_INF)
seq_paths.pop_back();
}
}
const MultiEdge& last_edge = multi_edges[multi_path.edge_idxs[multi_path.edge_idxs.size()-1]];
PathPos last_pos;
last_pos.breakage = last_edge.c_break;
last_pos.edge_idx =-1;
last_pos.mass = last_edge.c_break->mass;
last_pos.node_idx = last_edge.c_idx;
last_pos.node_score = last_edge.c_break->score;
for (i=0; i<seq_paths.size(); i++)
{
SeqPath& path= seq_paths[i];
path.path_score += last_pos.node_score;
path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
path.path_score -= (forbidden_pair_penalty * path.num_forbidden_nodes);
path.positions.push_back(last_pos);
path.make_seq_str(config);
}
}
struct var_combo {
var_combo() : path_score(0) {};
bool operator< (const var_combo& other) const
{
return (path_score>other.path_score);
}
vector<int> var_idxs;
vector<score_t> node_scores; // dim = #var_idxs+1
score_t path_score;
};
/************************************************************************
Expands the single multi path into all possible sequence variants.
Since this turns out to be the time-limiting process for long de nvoo,
this is implemented using a quick branch and bound that works on
edge variant indices. The SeqPaths are created only for the final
set of sequences.
*************************************************************************/
void PrmGraph::fast_expand_multi_path(const MultiPath& multi_path,
vector<SeqPath>& seq_paths,
score_t min_score,
score_t forbidden_pair_penalty,
int max_num_paths) const
{
const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
vector<score_t> max_attainable_scores;
const int num_edges = multi_path.edge_idxs.size();
max_attainable_scores.resize(num_edges+1,0);
int num_path_variants=1;
int e;
for (e=num_edges-1; e>=0; e--)
{
const int e_idx = multi_path.edge_idxs[e];
const MultiEdge& edge = multi_edges[e_idx];
max_attainable_scores[e]= max_attainable_scores[e+1] +
edge.max_variant_score + edge.c_break->score;
num_path_variants *= edge.variant_ptrs.size();
}
const score_t max_path_score = multi_edges[multi_path.edge_idxs[0]].n_break->score +
max_attainable_scores[0];
const score_t min_delta_allowed = min_score - max_path_score;
if (min_delta_allowed>0)
return; // this multipath won't help much
if (num_path_variants == 0)
{
cout << "Error: had an edge with 0 variants!" <<endl;
exit(1);
}
// perform expansion using a heap and condensed path representation
vector<int> var_positions; // holds the edge idxs
vector<int> num_vars;
vector<int> var_edge_positions;
vector< vector< score_t > > var_scores;
var_scores.clear();
var_positions.clear();
num_vars.clear();
int num_aas_in_multipath = 0;
for (e=0; e<num_edges; e++)
{
const int e_idx = multi_path.edge_idxs[e];
const MultiEdge& edge = multi_edges[e_idx];
const int num_vars_in_edge = edge.variant_ptrs.size();
vector<score_t> scores;
num_aas_in_multipath+=edge.num_aa;
if (num_vars_in_edge>1)
{
int i;
for (i=0; i<num_vars_in_edge; i++)
scores.push_back(edge.variant_scores[i]);
var_scores.push_back(scores);
var_positions.push_back(e);
num_vars.push_back(num_vars_in_edge);
var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa);
// cout << e << "\t" << e_idx << "\t" << num_vars_in_edge << "\t" << edge.num_aa << endl;
}
}
// create the SeqPaths from the edge_combos...
const int num_positions = num_aas_in_multipath+1;
SeqPath template_path; // holds common elements to all paths
template_path.n_term_aa = multi_path.n_term_aa;
template_path.c_term_aa = multi_path.c_term_aa;
template_path.n_term_mass = multi_path.n_term_mass;
template_path.c_term_mass = multi_path.c_term_mass;
template_path.multi_path_rank = multi_path.original_rank;
template_path.path_score = 0;
template_path.prm_ptr = (PrmGraph *)this;
template_path.positions.clear();
template_path.positions.resize(num_positions);
template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty;
int pos_idx=0;
for (e=0; e<multi_path.edge_idxs.size(); e++)
{
const int e_idx = multi_path.edge_idxs[e];
const MultiEdge& edge = get_multi_edge(e_idx);
PathPos& pos = template_path.positions[pos_idx++];
pos.breakage = edge.n_break;
pos.edge_idx = e_idx;
pos.mass = edge.n_break->mass;
pos.node_score = edge.n_break->score;
pos.node_idx = edge.n_idx;
template_path.path_score += pos.node_score;
int *variant_ptr = edge.variant_ptrs[0];
const int num_aa = *variant_ptr++;
int *aas = variant_ptr;
if (edge.variant_ptrs.size() == 1)
{
pos.edge_variant_score = edge.variant_scores[0];
pos.aa = aas[0];
template_path.path_score += pos.edge_variant_score;
}
if (edge.num_aa == 1)
continue;
int j;
for (j=1; j<edge.num_aa; j++)
{
PathPos& pos = template_path.positions[pos_idx++];
pos.breakage = NULL;
pos.aa = aas[j];
}
}
if (pos_idx != num_aas_in_multipath)
{
cout << "Error: mismatches between positions and multipath length!" << endl;
exit(1);
}
PathPos& last_pos = template_path.positions[num_positions-1];
const int last_e_idx = multi_path.edge_idxs[e-1];
const MultiEdge& last_edge = get_multi_edge(last_e_idx);
last_pos.breakage = last_edge.c_break;
last_pos.edge_idx =-1;
last_pos.mass = last_edge.c_break->mass;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -