denovodp.cpp
来自「MS-Clustering is designed to rapidly clu」· C++ 代码 · 共 1,254 行 · 第 1/3 页
CPP
1,254 行
#include "DeNovoDp.h"
#include "AnnotatedSpectrum.h"
#include "auxfun.h"
struct dis_pair {
dis_pair() : dis(0), left_idx(-1), right_idx(-1) {};
bool operator< (dis_pair& other)
{
return dis<other.dis;
}
bool operator< (dis_pair& other) const
{
return dis<other.dis;
}
bool operator< (const dis_pair& other)
{
return dis<other.dis;
}
bool operator< (const dis_pair& other) const
{
return dis<other.dis;
}
mass_t dis;
int left_idx,right_idx;
};
// fills in all cells in the dp_table according to the PrmGraph
void DeNovoDp::fill_dp_table(const PrmGraph *_prm, score_t sym_penalty)
{
prm = (PrmGraph *)_prm;
config=prm->config;
const vector<Node>& nodes = prm->nodes;
const vector<MultiEdge>& edges = prm->multi_edges;
const int num_nodes = nodes.size();
// forbidden window of 1 Daltons
const mass_t pm_with_19 = prm->pm_with_19;
const mass_t min_forbidden_sum=pm_with_19 - MASS_PROTON - config->get_tolerance()*1.5;
const mass_t max_forbidden_sum=pm_with_19 - MASS_PROTON + config->get_tolerance()*1.5;
const mass_t sym_axis = (pm_with_19 - 1.0) *0.5;
int i,j;
cells.clear();
cells.resize(num_nodes);
for (i=0; i<num_nodes; i++)
cells[i].resize(num_nodes);
vector<bool> skip_ind;
skip_ind.resize(num_nodes,false);
for (i=0; i<num_nodes; i++)
if (nodes[i].in_edge_idxs.size() == 0 &&
nodes[i].out_edge_idxs.size() == 0)
skip_ind[i]=true;
forbidden_idxs.resize(num_nodes,-1);
vector<dis_pair> pairs;
for (i=0; i<num_nodes-1; i++)
for (j=i+1; j<num_nodes; j++)
{
dis_pair p;
p.dis = nodes[j].mass - nodes[i].mass;
p.left_idx=i;
p.right_idx=j;
if (p.dis< 56.0)
continue;
pairs.push_back(p);
// mark forbidden pairs
mass_t sum = nodes[j].mass + nodes[i].mass;
if (sum>min_forbidden_sum && sum<max_forbidden_sum)
{
cells[i][j].is_forbidden=1;
// cout << "F: " << i << " " << j << endl;
if (forbidden_idxs[j]<0)
{
forbidden_idxs[j]=i;
forbidden_idxs[i]=j;
}
}
}
prm->set_frobidden_idxs(forbidden_idxs);
sort(pairs.begin(),pairs.end());
// init diagonal
for (i=0; i<num_nodes; i++)
cells[i][i].score = nodes[i].score;
// fill other cells
for (i=0; i<pairs.size(); i++)
{
const int left_idx = pairs[i].left_idx;
const int right_idx = pairs[i].right_idx;
if (sym_axis - nodes[left_idx].mass > nodes[right_idx].mass - sym_axis)
{
int j;
for (j=0; j<nodes[left_idx].out_edge_idxs.size(); j++)
{
int e_idx = nodes[left_idx].out_edge_idxs[j];
score_t s = nodes[left_idx].score + edges[e_idx].max_variant_score +
cells[edges[e_idx].c_idx][right_idx].score;
if (s > cells[left_idx][right_idx].score)
{
cells[left_idx][right_idx].score = s;
cells[left_idx][right_idx].prev_edge_idx = e_idx;
}
}
}
else // fill the right side
{
int j;
for (j=0; j<nodes[right_idx].in_edge_idxs.size(); j++)
{
int e_idx = nodes[right_idx].in_edge_idxs[j];
score_t s = nodes[right_idx].score + edges[e_idx].max_variant_score+
cells[left_idx][edges[e_idx].n_idx].score;
if (s > cells[left_idx][right_idx].score)
{
cells[left_idx][right_idx].score = s;
cells[left_idx][right_idx].prev_edge_idx = e_idx;
}
}
}
if (cells[left_idx][right_idx].is_forbidden)
cells[left_idx][right_idx].score -= sym_penalty;
}
}
MultiPath DeNovoDp::get_top_scoring_antisymetric_path( score_t sym_penalty) const
{
int i,j;
score_t max_score =NEG_INF;
int best_left=-1, best_right=-1;
for (i=0; i<cells.size()-1; i++)
for (j=i+1; j<cells.size(); j++)
if (cells[i][j].score > max_score)
{
best_left=i;
best_right=j;
max_score = cells[i][j].score;
}
MultiPath ret_path;
if (max_score<0)
return ret_path;
// collect edges and create path
vector<int> path_edges;
int l=best_left, r=best_right;
while (cells[l][r].prev_edge_idx>=0)
{
int e_idx = cells[l][r].prev_edge_idx;
path_edges.push_back(e_idx);
if (prm->multi_edges[e_idx].c_idx==r)
{
r = prm->multi_edges[e_idx].n_idx;
}
else
{
l = prm->multi_edges[e_idx].c_idx;
}
}
prm->create_path_from_edges(path_edges,ret_path);
ret_path.path_score = max_score;
return ret_path;
}
struct edge_idx_set {
edge_idx_set() : score(NEG_INF), length(0), num_aa(0) {};
bool operator< (const edge_idx_set& other) const
{
return score > other.score;
}
edge_idx_set& operator= (const edge_idx_set& e)
{
score = e.score;
length = e.length;
memcpy(edge_idxs,e.edge_idxs,length*sizeof(int));
return *this;
}
bool is_good_prefix(const vector<int>& good_idxs) const
{
int i;
for (i=0; i<length; i++)
{
int j;
for (j=0; j<good_idxs.size(); j++)
if (edge_idxs[i]==good_idxs[j])
break;
if (j==good_idxs.size())
return false;
}
return true;
}
void print()
{
cout << "P " << num_aa << "\t" << score << endl;
int i;
for (i=0; i<length; i++)
cout << i << "\t" << edge_idxs[i] << endl;
cout << endl;
}
score_t score;
int length;
int num_aa; // number of aas used to fill these edges
int edge_idxs[32]; // max peptide size...
};
/****************************************************************************
Computes for each node and number of amino acids, what is the maximal
score attainable by making X steps forward from that node.
Finds these values by performing a BFS search from all nodes.
Cell i,j holds the vlaue for num steps i , node j.
*****************************************************************************/
void DeNovoDp::find_max_gains_per_length(int max_length,
vector< vector< score_t > >& max_gains,
bool verbose) const
{
const int num_nodes = prm->get_num_nodes();
const vector<Node>& nodes = prm->get_nodes();
const vector<MultiEdge>& edges = prm->get_multi_edges();
int n,i;
max_gains.resize(num_nodes);
for (n=0; n<num_nodes; n++)
{
max_gains[n].resize(max_length+1,NEG_INF);
max_gains[n][0]=0;
}
for (i=1; i<=max_length; i++)
{
int n;
for (n=0; n<num_nodes; n++)
{
const Node& node = nodes[n];
const vector<int>& out_idxs = node.out_edge_idxs;
int j;
for (j=0; j<out_idxs.size(); j++)
{
const MultiEdge& edge = edges[out_idxs[j]];
const int num_aa = edge.num_aa;
const int c_idx = edge.c_idx;
if (num_aa>i)
continue;
const score_t new_score = max_gains[c_idx][i-num_aa] + nodes[c_idx].score + edge.max_variant_score;
if (new_score > max_gains[n][i])
max_gains[n][i]=new_score;
}
}
}
for (n=0; n<num_nodes; n++)
{
score_t max=NEG_INF;
int max_idx = 0;
for (i=0; i<=max_length; i++)
if (max_gains[n][i]>max)
{
max=max_gains[n][i];
max_idx=i;
}
for (i=max_idx +1; i<=max_length; i++)
max_gains[n][i]=max;
}
/* max_gains.resize(max_length+1);
for (i=0; i<max_gains.size(); i++)
max_gains[i].resize(num_nodes,NEG_INF);
for (n=0; n<num_nodes; n++)
max_gains[0][n]=0;
for (n=num_nodes-1; n>=0; n--)
{
if (nodes[n].score<=NEG_INF)
continue;
const Node& node = nodes[n];
const vector<int>& out_idxs = node.out_edge_idxs;
int j;
for (j=0; j<out_idxs.size(); j++)
{
const MultiEdge& edge = edges[out_idxs[j]];
// loop over different number of amino acids in edge
const int num_aa = edge.num_aa;
const int c_idx = edge.c_idx;
const score_t add_score = nodes[c_idx].score + edge.max_variant_score;
int k;
for (k=0; k<max_length-num_aa; k++)
if (max_gains[k][c_idx]>NEG_INF && max_gains[k][c_idx]+add_score>max_gains[k+num_aa][n])
max_gains[k+num_aa][n]=max_gains[k][c_idx]+add_score;
}
score_t max=NEG_INF;
int max_idx = 0;
for (i=1; i<=max_length; i++)
if (max_gains[i][n]>max)
{
max=max_gains[i][n];
max_idx=i;
}
for (i=max_idx +1; i<=max_length; i++)
max_gains[i][n]=max;
} */
if (verbose)
{
cout << "MAX GAINS: " << endl;
int n;
for (n=0; n<num_nodes; n++)
{
cout << left << setw(4) << n << " ";
for (i=0; i<=max_length; i++)
cout << setw(5) << (max_gains[n][i]>-1000 ? max_gains[n][i] : -999) << " ";
cout << endl;
}
cout << endl;
}
}
/*******************************************************************************
Returns all the top scoring paths, uses a DFS search with branch and bound pruning.
limits the length of the solution to be between (approximately) supplied bounds.
************************************************************************************/
void DeNovoDp::get_top_scoring_antisymetric_paths_with_length_limits(
vector<MultiPath>& multi_paths,
int required_num_paths,
int min_length,
int max_length,
score_t sym_penalty,
score_t min_score_needed,
bool try_complete_sequences,
bool only_complete_sequences,
double half_life_time) const
{
const vector<Node>& nodes = prm->nodes;
const vector<MultiEdge>& multi_edges = prm->multi_edges;
const int num_nodes = nodes.size();
const int last_node_idx = num_nodes-1;
int last_heap_pos = required_num_paths - 1;
int last_alt_heap_pos = required_num_paths - 1;
const score_t non_complete_penalty = 0; // this is a penalty added for each end of the peptide
// that does not reach the terminal. Used to bias // against sequences that do not read ends.
vector<edge_idx_set> heap, // heap is used to store the paths. If running time takes too long,
// we start reducing its size so the prunning becomes more efficient
// in such a case, we start putting removed paths into the alt_heap
// so we don't lose good paths
alt_heap;
vector< vector<score_t> > max_gains_for_length; // the maximal score attainable from each node
// using a given number of amino acids.
// length, node_idx
vector< int > node_ordering; // holds the optimal order of nodes for the BB search
vector<score_t> added_scores; // holds for each depth in the tree, the score that was
// added by using the edges in the current path
vector<int> out_idx_counters; // for each node, what branch are we going down
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?