⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 multipath.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include "PrmGraph.h"

struct pos_score_pair {
	pos_score_pair() : pos(NEG_INF), score(NEG_INF) {};
	bool operator< (const pos_score_pair& other) const
	{
		return (score > other.score);
	}
	int pos;
	score_t score;
};


// This struct uses a simple score based comparison
struct PathHeap {
public:
	void init(int size) 
	{ 
		heap_size = size; 
		paths.resize(heap_size); 
		score_pairs.resize(heap_size);
		int i;
		for (i=0; i<heap_size; i++)
		{
			score_pairs[i].pos = i;
			score_pairs[i].score = NEG_INF;
		}
	}

	score_t get_min_score() const { return score_pairs[0].score; }

	int get_num_real_entries() const
	{
		int i,n=0;
		for (i=0; i<score_pairs.size(); i++)
			if (score_pairs[i].score>NEG_INF)
				n++;
		return n;
	}

	void add_path(const SeqPath& path)
	{
		if (path.path_score<=score_pairs[0].score)
			return;
		
		const int pos = score_pairs[0].pos;
		paths[pos]=path;

		pop_heap(score_pairs.begin(),score_pairs.end());
		score_pairs[heap_size-1].pos = pos;
		score_pairs[heap_size-1].score = path.path_score;
		push_heap(score_pairs.begin(),score_pairs.end());
	}

	void sort_paths() { sort(paths.begin(),paths.end(),comp_SeqPath_path_score); }

	vector<SeqPath> get_paths() { return paths; }

private:
	int heap_size;

	vector<pos_score_pair> score_pairs;
	vector<SeqPath> paths;  // this is a min heap smallest score in front!
};






// add the variant ptr and scores for this combo
void PrmGraph::add_and_score_edge_variants(const AA_combo& aa_combo, MultiEdge& edge)
{
	const vector<int>& aa_positions = config->get_aa_positions();
	const bool reaches_n_term = (nodes[edge.n_idx].type == NODE_N_TERM);
	const bool reaches_c_term = (nodes[edge.c_idx].type == NODE_C_TERM);
	int v;
	
	int *variant_ptr = (int *)config->get_variant_ptr(aa_combo.variant_start_idx);

	for (v=0; v<aa_combo.num_variants; v++)
	{
		int num_aa = *variant_ptr;
		int *aas = variant_ptr+1;

		int i;
		for (i=0; i<num_aa; i++)
			if (aa_positions[*(aas+i)])
				break;

		// need to check that the variant is not violating any of the position restrictions
		// such as +1 -1 positions
		if (i<num_aa)
		{
			if (reaches_n_term)
			{
				int a;
				for (a=0; a<num_aa; a++)
				{
					int aa_idx = aas[a];
					if (aa_positions[aa_idx] != 0 && aa_positions[aa_idx] != a+1)
						break; 
				}
				if (a<num_aa)
				{
					variant_ptr+= num_aa +1;
					continue;
				}
			}
			else // check for +1 positions
			{
				int a;
				for (a=0; a<num_aa; a++)
				{
					int aa_idx = aas[a];
					if (aa_positions[aa_idx] == 1)
						break; 
				}
				if (a<num_aa)
				{
					variant_ptr+= num_aa +1;
					continue;
				}
			}

			if (reaches_c_term)
			{
				int a;
				for (a=0; a<num_aa; a++)
				{
					int aa_idx = aas[a];
					if (aa_positions[aa_idx] != 0 && aa_positions[aa_idx] != a-num_aa )
						break; 
				}
				if (a<num_aa) // found a problem with one of the aa positions
				{
					variant_ptr+= num_aa +1;
					continue;
				}
			}
			else // check for -1 positions
			{
				int a;
				for (a=0; a<num_aa; a++)
				{
					int aa_idx = aas[a];
					if (aa_positions[aa_idx] == -1)
						break; 
				}
				if (a<num_aa)
				{
					variant_ptr+= num_aa +1;
					continue;
				}
			}

			

		}
		
		score_t variant_score = calc_edge_variant_score(edge,num_aa,aas);

		if (variant_score>edge.max_variant_score)
			edge.max_variant_score = variant_score;

		edge.variant_ptrs.push_back(variant_ptr);
		edge.variant_scores.push_back(variant_score);
		edge.variant_probs.push_back(0);

		variant_ptr+= num_aa +1;
	}
}


// adds the relevant PathPos to the path and adjusts the other non-terminal values 
void SeqPath::add_edge_variant(const MultiEdge& edge, int e_idx, int variant_idx)
{
	PathPos new_pos;

	int *variant_ptr = edge.variant_ptrs[variant_idx];
	int num_aa = *variant_ptr++;
	int *aas = variant_ptr;

	if (num_aa != edge.num_aa)
	{
		cout << "Error: edge and variant mixup!" << endl;
		exit(1);
	}
	
	num_aa += edge.num_aa;
	
	new_pos.breakage = edge.n_break;
	new_pos.edge_idx = e_idx;
	new_pos.mass = edge.n_break->mass;
	new_pos.edge_variant_score = edge.variant_scores[variant_idx];
	new_pos.node_score = edge.n_break->score;
	new_pos.node_idx = edge.n_idx;
	new_pos.aa = aas[0];

	path_score += new_pos.edge_variant_score + new_pos.node_score;

	positions.push_back(new_pos);

	if (edge.num_aa == 1)
		return;

	int i;
	for (i=1; i<edge.num_aa; i++)
	{
		PathPos new_pos;

		new_pos.breakage = NULL;
		new_pos.aa = aas[i]; // the rest of the fields are initialized to the default (NULL) values
		positions.push_back(new_pos);
	}
	
}


/************************************************************************

  Expands the single multi path into all possible sequence variants.
  This is done incrementaly by expanding each multi edges

*************************************************************************/
void PrmGraph::expand_multi_path(const MultiPath& multi_path, 
								 vector<SeqPath>& seq_paths,
								 score_t min_score,
								 score_t forbidden_pair_penalty,
								 int max_num_paths) const
{
	const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
	vector<score_t> max_attainable_scores;

	const int num_edges = multi_path.edge_idxs.size();
	max_attainable_scores.resize(num_edges+1,0);

	int e;
	for (e=num_edges-1; e>=0; e--)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];

		max_attainable_scores[e]= max_attainable_scores[e+1] + 
			edge.max_variant_score + edge.c_break->score; 
	}

	if (min_score>multi_edges[multi_path.edge_idxs[0]].n_break->score + max_attainable_scores[0])
		return;

	int i;
	seq_paths.resize(1);

	seq_paths[0].n_term_aa = multi_path.n_term_aa;
	seq_paths[0].c_term_aa = multi_path.c_term_aa;
	seq_paths[0].n_term_mass = multi_path.n_term_mass;
	seq_paths[0].c_term_mass = multi_path.c_term_mass;
	seq_paths[0].multi_path_rank = multi_path.original_rank;
	seq_paths[0].path_score = 0;
	seq_paths[0].prm_ptr = (PrmGraph *)this;
	seq_paths[0].positions.clear();

	for (e=0; e<multi_path.edge_idxs.size(); e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];

		if (edge.get_num_variants() == 1)
		{
			const int * variant_ptr = edge.variant_ptrs[0];
			const int num_aa = *variant_ptr++;
			const int *aas = variant_ptr;
		
			int i;
			for (i=0; i<seq_paths.size(); i++)
			{
				if (seq_paths[i].path_score>NEG_INF && 
					seq_paths[i].path_score + max_attainable_scores[e] > min_score)
				{
					seq_paths[i].add_edge_variant(edge,e_idx,0);
				}
				else
					seq_paths[i].path_score = NEG_INF;
			}
		}
		else
		{
			vector<SeqPath> old_paths = seq_paths;
			seq_paths.resize(seq_paths.size()*edge.get_num_variants());

			int v;
			int idx=0;
			for (v=0; v<edge.get_num_variants(); v++)
			{
				int i;
				for (i=0; i<old_paths.size(); i++)
					seq_paths[idx++]=old_paths[i];
			}

			
			idx=0;
			for (v=0; v<edge.get_num_variants(); v++)
			{
				const int * variant_ptr =  edge.variant_ptrs[v];
				const int num_aa = *variant_ptr++;
				const int *aas = variant_ptr;

				int i;
				for (i=0; i<old_paths.size(); i++)
				{
					if (seq_paths[idx].path_score>NEG_INF &&
						seq_paths[idx].path_score + max_attainable_scores[e] > min_score)
					{
						seq_paths[idx].add_edge_variant(edge,e_idx,v);
					}
					else
						seq_paths[idx].path_score=NEG_INF;
					idx++;
				}
			}	
		}

		
		// if too many paths are here sort and pop back
		if (max_num_paths>0 && seq_paths.size()>max_num_paths)
		{
			sort(seq_paths.begin(),seq_paths.end(),comp_SeqPath_path_score);
			while (seq_paths.size()>max_num_paths && seq_paths[seq_paths.size()-1].path_score == NEG_INF)
				seq_paths.pop_back();
		}
		else // remove all seq paths with NEG_INF score
		{
			int last_idx = seq_paths.size()-1;
			int i;
			for (i=0; i<last_idx; i++)
			{
				while (last_idx>i && seq_paths[last_idx].path_score == NEG_INF)
					last_idx--;
				
				if (seq_paths[i].path_score == NEG_INF)
				{
					seq_paths[i]=seq_paths[last_idx];
					seq_paths[last_idx].path_score = NEG_INF;
				}
			}

			while (seq_paths.size()>0 && seq_paths[seq_paths.size()-1].path_score == NEG_INF)
				seq_paths.pop_back();
		}
	}

	const MultiEdge& last_edge = multi_edges[multi_path.edge_idxs[multi_path.edge_idxs.size()-1]];
	PathPos last_pos;

	last_pos.breakage = last_edge.c_break;
	last_pos.edge_idx =-1;
	last_pos.mass = last_edge.c_break->mass;
	last_pos.node_idx = last_edge.c_idx;
	last_pos.node_score = last_edge.c_break->score;


	for (i=0; i<seq_paths.size(); i++)
	{
		SeqPath& path= seq_paths[i];
		path.path_score += last_pos.node_score;
		path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
		path.path_score -= (forbidden_pair_penalty * path.num_forbidden_nodes);
		path.positions.push_back(last_pos);
		path.make_seq_str(config);
	}
}

struct var_combo {
	var_combo() : path_score(0) {};
	bool operator< (const var_combo& other) const
	{
		return (path_score>other.path_score);
	}

	vector<int> var_idxs;
	vector<score_t> node_scores; // dim = #var_idxs+1
	score_t path_score;
};






/************************************************************************

  Expands the single multi path into all possible sequence variants.
  Since this turns out to be the time-limiting process for long de nvoo,
  this is implemented using a quick branch and bound that works on
  edge variant indices. The SeqPaths are created only for the final
  set of sequences.

*************************************************************************/
void PrmGraph::fast_expand_multi_path(const MultiPath& multi_path, 
									  vector<SeqPath>& seq_paths,
									  score_t min_score,
									  score_t forbidden_pair_penalty,
									  int max_num_paths) const
{
	const vector<AA_combo>& aa_edge_comobos = config->get_aa_edge_combos();
	vector<score_t> max_attainable_scores;

	const int num_edges = multi_path.edge_idxs.size();
	max_attainable_scores.resize(num_edges+1,0);
	
	int num_path_variants=1;
	int e;
	for (e=num_edges-1; e>=0; e--)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];

		max_attainable_scores[e]= max_attainable_scores[e+1] + 
			edge.max_variant_score + edge.c_break->score; 
		num_path_variants *= edge.variant_ptrs.size();
	}

	const score_t max_path_score =  multi_edges[multi_path.edge_idxs[0]].n_break->score + 
									max_attainable_scores[0];
	const score_t min_delta_allowed = min_score - max_path_score;

	if (min_delta_allowed>0)
		return; // this multipath won't help much

	if (num_path_variants == 0)
	{
		cout << "Error: had an edge with 0 variants!" <<endl;
		exit(1);
	}

	// perform expansion using a heap and condensed path representation
	vector<int> var_positions;				   // holds the edge idxs
	vector<int> num_vars;
	vector<int> var_edge_positions;
	vector< vector< score_t > > var_scores;
	var_scores.clear();
	var_positions.clear();
	num_vars.clear();
	int num_aas_in_multipath = 0;
	for (e=0; e<num_edges; e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = multi_edges[e_idx];
		const int num_vars_in_edge = edge.variant_ptrs.size();
		vector<score_t> scores;

		num_aas_in_multipath+=edge.num_aa;

		if (num_vars_in_edge>1)
		{
			int i;
			for (i=0; i<num_vars_in_edge; i++)
				scores.push_back(edge.variant_scores[i]);
		
			var_scores.push_back(scores);
			var_positions.push_back(e);
			num_vars.push_back(num_vars_in_edge);
			var_edge_positions.push_back(num_aas_in_multipath-edge.num_aa);

		//	cout << e << "\t" << e_idx << "\t" << num_vars_in_edge << "\t" << edge.num_aa << endl;
		}	
	}

		// create the SeqPaths from the edge_combos...
	const int num_positions = num_aas_in_multipath+1;
	SeqPath template_path;				// holds common elements to all paths
	template_path.n_term_aa = multi_path.n_term_aa;
	template_path.c_term_aa = multi_path.c_term_aa;
	template_path.n_term_mass = multi_path.n_term_mass;
	template_path.c_term_mass = multi_path.c_term_mass;
	template_path.multi_path_rank = multi_path.original_rank;
	template_path.path_score = 0;
	template_path.prm_ptr = (PrmGraph *)this;
	template_path.positions.clear();
	template_path.positions.resize(num_positions);

	template_path.num_forbidden_nodes = multi_path.num_forbidden_nodes;
	template_path.path_score -= template_path.num_forbidden_nodes * forbidden_pair_penalty;

	int pos_idx=0;
	for (e=0; e<multi_path.edge_idxs.size(); e++)
	{
		const int e_idx = multi_path.edge_idxs[e];
		const MultiEdge& edge = get_multi_edge(e_idx);
		PathPos& pos = template_path.positions[pos_idx++];

		pos.breakage = edge.n_break;
		pos.edge_idx = e_idx;
		pos.mass =  edge.n_break->mass;
		pos.node_score = edge.n_break->score;
		pos.node_idx = edge.n_idx;
		template_path.path_score += pos.node_score;

		int *variant_ptr = edge.variant_ptrs[0];
		const int num_aa = *variant_ptr++;
		int *aas = variant_ptr;

		if (edge.variant_ptrs.size() == 1)
		{
			pos.edge_variant_score = edge.variant_scores[0];
			pos.aa = aas[0];
			template_path.path_score += pos.edge_variant_score;
		}

		if (edge.num_aa == 1)
			continue;
		
		int j;
		for (j=1; j<edge.num_aa; j++)
		{
			PathPos& pos = template_path.positions[pos_idx++];
			pos.breakage = NULL;
			pos.aa = aas[j];
		}
	}

	if (pos_idx != num_aas_in_multipath)
	{
		cout << "Error: mismatches between positions and multipath length!" << endl;
		exit(1);
	}
	
	PathPos& last_pos = template_path.positions[num_positions-1];
	const int last_e_idx = multi_path.edge_idxs[e-1];
	const MultiEdge& last_edge = get_multi_edge(last_e_idx);
	last_pos.breakage = last_edge.c_break;
	last_pos.edge_idx =-1;
	last_pos.mass = last_edge.c_break->mass;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -