denovodp.cpp

来自「MS-Clustering is designed to rapidly clu」· C++ 代码 · 共 1,254 行 · 第 1/3 页
CPP
1,254 行
#include "DeNovoDp.h"
#include "AnnotatedSpectrum.h"
#include "auxfun.h" 

struct dis_pair  {
	dis_pair() : dis(0), left_idx(-1), right_idx(-1) {};
	
	bool operator< (dis_pair& other)
	{
		return dis<other.dis;
	}

	bool operator< (dis_pair& other) const
	{
		return dis<other.dis;
	} 

	bool operator< (const dis_pair& other)
	{
		return dis<other.dis;
	}

	bool operator< (const dis_pair& other) const
	{
		return dis<other.dis;
	}

	mass_t dis;
	
	int left_idx,right_idx;
};




// fills in all cells in the dp_table according to the PrmGraph
void DeNovoDp::fill_dp_table(const PrmGraph *_prm, score_t sym_penalty)
{
	prm = (PrmGraph *)_prm;
	config=prm->config;
	const vector<Node>& nodes = prm->nodes;
	const vector<MultiEdge>& edges = prm->multi_edges;
	const int num_nodes = nodes.size();

	// forbidden window of 1 Daltons
	const mass_t pm_with_19 = prm->pm_with_19;
	const mass_t min_forbidden_sum=pm_with_19 - MASS_PROTON - config->get_tolerance()*1.5;
	const mass_t max_forbidden_sum=pm_with_19 - MASS_PROTON + config->get_tolerance()*1.5;
	const mass_t sym_axis = (pm_with_19 - 1.0) *0.5;

	int i,j;
	cells.clear();
	cells.resize(num_nodes);
	for (i=0; i<num_nodes; i++)
		cells[i].resize(num_nodes);

	vector<bool> skip_ind;
	skip_ind.resize(num_nodes,false);
	for (i=0; i<num_nodes; i++)
		if (nodes[i].in_edge_idxs.size()  == 0 && 
			nodes[i].out_edge_idxs.size() == 0)
			skip_ind[i]=true;

	forbidden_idxs.resize(num_nodes,-1);
	vector<dis_pair> pairs;
	for (i=0; i<num_nodes-1; i++)
		for (j=i+1; j<num_nodes; j++)
		{
			dis_pair p;
			p.dis = nodes[j].mass - nodes[i].mass;
			p.left_idx=i;
			p.right_idx=j;

			if (p.dis< 56.0)
				continue;

			pairs.push_back(p);

			// mark forbidden pairs
			mass_t sum = nodes[j].mass + nodes[i].mass;
			if (sum>min_forbidden_sum && sum<max_forbidden_sum)
			{
				cells[i][j].is_forbidden=1;

//				cout << "F: " << i << " " << j << endl;

				if (forbidden_idxs[j]<0)
				{
					forbidden_idxs[j]=i;
					forbidden_idxs[i]=j;
				}
			}
		}

	prm->set_frobidden_idxs(forbidden_idxs);

	sort(pairs.begin(),pairs.end());

	// init diagonal
	for (i=0; i<num_nodes; i++)
		cells[i][i].score = nodes[i].score;

	// fill other cells
	for (i=0; i<pairs.size(); i++)
	{
		const int left_idx = pairs[i].left_idx;
		const int right_idx = pairs[i].right_idx;

		if (sym_axis - nodes[left_idx].mass > nodes[right_idx].mass - sym_axis)
		{
			int j;
			for (j=0; j<nodes[left_idx].out_edge_idxs.size(); j++)
			{
				int e_idx = nodes[left_idx].out_edge_idxs[j];
				score_t s = nodes[left_idx].score + edges[e_idx].max_variant_score + 
							cells[edges[e_idx].c_idx][right_idx].score;
				if (s > cells[left_idx][right_idx].score)
				{
					cells[left_idx][right_idx].score = s;
					cells[left_idx][right_idx].prev_edge_idx = e_idx;
				}
			}
		}
		else  // fill the right side
		{
			int j;
			for (j=0; j<nodes[right_idx].in_edge_idxs.size(); j++)
			{
				int e_idx = nodes[right_idx].in_edge_idxs[j];
				score_t s = nodes[right_idx].score + edges[e_idx].max_variant_score+ 
							cells[left_idx][edges[e_idx].n_idx].score;
				if (s > cells[left_idx][right_idx].score)
				{
					cells[left_idx][right_idx].score = s;
					cells[left_idx][right_idx].prev_edge_idx = e_idx;
				}
			}
		}

		if (cells[left_idx][right_idx].is_forbidden)
			cells[left_idx][right_idx].score -= sym_penalty;
	}


	
}



MultiPath DeNovoDp::get_top_scoring_antisymetric_path( score_t sym_penalty) const
{
	int i,j;
	score_t max_score =NEG_INF;
	int best_left=-1, best_right=-1;

	for (i=0; i<cells.size()-1; i++)
		for (j=i+1; j<cells.size(); j++)
			if (cells[i][j].score > max_score)
			{
				best_left=i;
				best_right=j;
				max_score = cells[i][j].score;
			}

	MultiPath ret_path;
	if (max_score<0)
		return ret_path;

	// collect edges and create path
	vector<int> path_edges;
	int l=best_left, r=best_right;
	while (cells[l][r].prev_edge_idx>=0)
	{
		int e_idx = cells[l][r].prev_edge_idx;
		path_edges.push_back(e_idx);

		if (prm->multi_edges[e_idx].c_idx==r)
		{
			r = prm->multi_edges[e_idx].n_idx;
		}
		else
		{
			l = prm->multi_edges[e_idx].c_idx;
		}
	}
	
	prm->create_path_from_edges(path_edges,ret_path);
	ret_path.path_score = max_score;

	return ret_path;
}




struct edge_idx_set {
	edge_idx_set() : score(NEG_INF), length(0), num_aa(0) {};

	bool operator< (const edge_idx_set& other) const
	{
		return score > other.score;
	}

	edge_idx_set& operator= (const edge_idx_set& e)
	{
		score = e.score;
		length = e.length;
		memcpy(edge_idxs,e.edge_idxs,length*sizeof(int));

		return *this;
	}

	bool is_good_prefix(const vector<int>& good_idxs) const
	{
		
		int i;
		for (i=0; i<length; i++)
		{
			int j;
			for (j=0; j<good_idxs.size(); j++)
				if (edge_idxs[i]==good_idxs[j])
					break;
			if (j==good_idxs.size())
				return false;
		}
		return true;
	}

	void print()
	{
		cout << "P " << num_aa << "\t" << score << endl;
		int i;
		for (i=0; i<length; i++)
			cout << i << "\t" << edge_idxs[i] << endl;
		cout << endl;
	}

	score_t score;
	int length;
	int num_aa; // number of aas used to fill these edges
	int edge_idxs[32]; // max peptide size...
};




/****************************************************************************
   Computes for each node and number of amino acids, what is the maximal
   score attainable by making X steps forward from that node.
   Finds these values by performing a BFS search from all nodes.
   Cell i,j holds the vlaue for num steps i , node j.
*****************************************************************************/
void DeNovoDp::find_max_gains_per_length(int max_length, 
										 vector< vector< score_t > >& max_gains, 
										 bool verbose) const
{
	const int num_nodes = prm->get_num_nodes();
	const vector<Node>& nodes = prm->get_nodes();
	const vector<MultiEdge>& edges = prm->get_multi_edges();
	int n,i;

	max_gains.resize(num_nodes);
	for (n=0; n<num_nodes; n++)
	{
		max_gains[n].resize(max_length+1,NEG_INF);
		max_gains[n][0]=0;
	}


	for (i=1; i<=max_length; i++)
	{
		int n;
		for (n=0; n<num_nodes; n++)
		{
			const Node& node = nodes[n];
			const vector<int>& out_idxs = node.out_edge_idxs;
			int j;
			for (j=0; j<out_idxs.size(); j++)
			{
				const MultiEdge& edge = edges[out_idxs[j]];
				const int num_aa = edge.num_aa;
				const int c_idx  = edge.c_idx;
				if (num_aa>i)
					continue;

				const score_t new_score = max_gains[c_idx][i-num_aa] + nodes[c_idx].score + edge.max_variant_score;

				if (new_score > max_gains[n][i])
					max_gains[n][i]=new_score;
			}
		}
	}

	for (n=0; n<num_nodes; n++)
	{
		score_t max=NEG_INF;
		int max_idx = 0;
		for (i=0; i<=max_length; i++)
			if (max_gains[n][i]>max)
			{
				max=max_gains[n][i];
				max_idx=i;
			}

		for (i=max_idx +1; i<=max_length; i++)
			max_gains[n][i]=max;
	}



/*	max_gains.resize(max_length+1);
	for (i=0; i<max_gains.size(); i++)
		max_gains[i].resize(num_nodes,NEG_INF);

	for (n=0; n<num_nodes; n++)
		max_gains[0][n]=0;

	for (n=num_nodes-1; n>=0; n--)
	{
		if (nodes[n].score<=NEG_INF)
			continue;

		const Node& node = nodes[n];
		const vector<int>& out_idxs = node.out_edge_idxs;
		int j;


		for (j=0; j<out_idxs.size(); j++)
		{
			const MultiEdge& edge = edges[out_idxs[j]];

			// loop over different number of amino acids in edge
			const int num_aa = edge.num_aa;
			const int c_idx  = edge.c_idx;
			const score_t add_score = nodes[c_idx].score + edge.max_variant_score;

			int k;
			for (k=0; k<max_length-num_aa; k++)
				if (max_gains[k][c_idx]>NEG_INF && max_gains[k][c_idx]+add_score>max_gains[k+num_aa][n])
					max_gains[k+num_aa][n]=max_gains[k][c_idx]+add_score;
		}

		score_t max=NEG_INF;
		int max_idx = 0;
		for (i=1; i<=max_length; i++)
			if (max_gains[i][n]>max)
			{
				max=max_gains[i][n];
				max_idx=i;
			}

		for (i=max_idx +1; i<=max_length; i++)
			max_gains[i][n]=max;
	} */


	if (verbose)
	{
		cout << "MAX GAINS: " << endl;
		int n;
		for (n=0; n<num_nodes; n++)
		{
			cout << left << setw(4) << n << "  ";
			for (i=0; i<=max_length; i++)	
				cout << setw(5) << (max_gains[n][i]>-1000 ? max_gains[n][i] : -999) << " ";
			cout << endl;
		}
		cout << endl;
	}
}




/*******************************************************************************
 Returns all the top scoring paths, uses a DFS search with branch and bound pruning.
 limits the length of the solution to be between (approximately) supplied bounds.
************************************************************************************/
void DeNovoDp::get_top_scoring_antisymetric_paths_with_length_limits(
					vector<MultiPath>& multi_paths, 
					int required_num_paths, 
					int min_length,
					int max_length,
					score_t sym_penalty,
					score_t min_score_needed,
					bool try_complete_sequences,
					bool only_complete_sequences,
					double half_life_time) const
{
	const vector<Node>& nodes = prm->nodes;
	const vector<MultiEdge>& multi_edges = prm->multi_edges;
	const int num_nodes = nodes.size();
	const int last_node_idx = num_nodes-1;

	int last_heap_pos = required_num_paths - 1;
	int last_alt_heap_pos = required_num_paths - 1;


	const score_t non_complete_penalty = 0; // this is a penalty added for each end of the peptide
											  // that does not reach the terminal. Used to bias 										     // against sequences that do not read ends.
							
	vector<edge_idx_set> heap,		// heap is used to store the paths. If running time takes too long,
									// we start reducing its size so the  prunning becomes more efficient
									// in such a case, we start putting removed paths into the alt_heap
									// so we don't lose good paths
						 alt_heap;
	
	vector< vector<score_t> > max_gains_for_length; // the maximal score attainable from each node
										            // using a given number of amino acids.
												    // length, node_idx

	vector< int > node_ordering; // holds the optimal order of nodes for the BB search

	vector<score_t> added_scores;  // holds for each depth in the tree, the score that was
								   // added by using the edges in the current path          

	vector<int> out_idx_counters; // for each node, what branch are we going down
denovodp.cpp - 源码说明

本页面展示了「MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr」中的 denovodp.cpp 源码文件，采用 C++ 编程语言编写，共 1,254 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与质谱分析相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?