📄 prmgraph.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:


	int i;
	for (i=0; i<num_nodes; i++)
	{
		Node& node = nodes[i];
		if (node.type == NODE_C_TERM)
			continue;

		const mass_t& node_mass = node.mass;
		
		int current_e_idx = -1;
		int current_c_idx = -1;

		int last_node_idx=i;
		
		int c;
		for (c=0; c<num_combos; c++)
		{
			const int& combo_idx = double_edge_combo_idxs[c];
			const AA_combo& combo = aa_edge_combos[combo_idx];
			const mass_t& combo_mass = combo.total_mass;
	
			// should this combo be check - if there is a single edge with one of the
			// amino acids in the combo skip this combo
			bool is_overlap_edge = false;
			int k;
			for (k=0; k<combo.num_aa; k++)
				if (out_aa_ind[combo.amino_acids[k]][i])
					break;
			if (k<combo.num_aa)
			{
				const int& overlap_aa = combo.amino_acids[k];
				if (i>0 && ! add_overlap_edges && 
					overlap_aa != Pro && overlap_aa != Gly && overlap_aa != His &&  overlap_aa != Ser)
					continue;
				is_overlap_edge = true;
			}

			const mass_t exp_connect_mass = node_mass + combo_mass;
			const mass_t min_connect_mass1 = exp_connect_mass - half_tolerance;
			const mass_t max_connect_mass1 = exp_connect_mass + half_tolerance;

			int n_idx;
			score_t max_score = NEG_INF;
			int     max_idx   = -1;
			for (n_idx = last_node_idx+1; n_idx<num_nodes; n_idx++)
			{
				const mass_t& next_node_mass = nodes[n_idx].mass;
				
				if (next_node_mass<min_connect_mass1)
					continue;

				if (next_node_mass>max_connect_mass1)
					break;

				if (nodes[n_idx].score>max_score)
				{
					max_idx = n_idx;
					max_score = nodes[n_idx].score;
				}
			}

			// if couldn't connect with small tolerance, try larger margin
			if (max_idx<0)
			{
				const mass_t min_connect_mass2 = exp_connect_mass - digest_tolerance;
				const mass_t max_connect_mass2 = exp_connect_mass + digest_tolerance;

				int n_idx;
				for (n_idx = last_node_idx+1; n_idx<num_nodes; n_idx++)
				{
					const mass_t& next_node_mass = nodes[n_idx].mass;
					
					if (next_node_mass<min_connect_mass2)
					{
						last_node_idx++;
						continue;
					}

					if (next_node_mass>max_connect_mass2)
						break;

					if (nodes[n_idx].score>max_score)
					{
						max_idx = n_idx;
						max_score = nodes[n_idx].score;
					}
				}
			}


			if (max_score>NEG_INF)
			{
				// need to check if distance between peaks is within tolerance
				mass_t min_diff = fabs(nodes[max_idx].mass-exp_connect_mass);

				// try seeing if there are peaks that can bridge them
				if (min_diff>tolerance)
				{
					const vector<BreakageFragment>& fragments1 = nodes[i].breakage.fragments;
					const vector<BreakageFragment>& fragments2 = nodes[max_idx].breakage.fragments;

					int f;
					for (f=0; f<strong_fragment_idxs.size(); f++)
					{
						const int& strong_frag_idx = strong_fragment_idxs[f];
						const int pos1 = nodes[i].breakage.get_position_of_frag_idx(strong_frag_idx);
						if (pos1<0)
							continue;

						const int pos2 = nodes[max_idx].breakage.get_position_of_frag_idx(strong_frag_idx);
						if (pos2<0)
							continue;

						mass_t mass_diff = fabs(fragments1[pos1].mass - fragments2[pos2].mass);
						
						const int frag_charge = config->get_fragment(strong_frag_idx).charge;
						if (frag_charge>1)
							mass_diff *= frag_charge;

						mass_t diff = fabs(mass_diff - combo_mass);

						if (diff < min_diff)
							min_diff = diff;
					}
				}

				// larger tolerance is allowed for the digest and terminal nodes because they can be
				// created relative to the terminal node
				if (nodes[max_idx].type == NODE_DIGEST || nodes[i].type == NODE_DIGEST)
				{
					if (min_diff>digest_tolerance)
							continue;
				}
				else
				if (nodes[max_idx].type == NODE_C_TERM || nodes[i].type == NODE_N_TERM )
				{
					if (min_diff>digest_tolerance)
						continue;
				}
				else
					if (min_diff>tolerance)
						continue;
				
				// add combo idx
				if (current_c_idx != max_idx)
				{
					current_e_idx = find_edge_idx_ij(i,max_idx);
					current_c_idx = max_idx;
				}

				// check that the node in the middle is not too good
				if (is_overlap_edge)
				{
				}

				if (current_e_idx>=0 && multi_edges[current_e_idx].num_aa == 2)
				{
					MultiEdge& edge = multi_edges[current_e_idx];
					
					add_and_score_edge_variants(combo,edge);

					if (is_overlap_edge)
						edge.ind_edge_overlaps = true;

				}
				else // create new edge
				{
					MultiEdge new_edge;
					
					new_edge.n_idx = i;
					new_edge.c_idx = max_idx;
					new_edge.n_break = &nodes[i].breakage;
					new_edge.c_break = &nodes[max_idx].breakage;
					new_edge.num_aa= 2;
					new_edge.ind_edge_overlaps=is_overlap_edge;

					add_and_score_edge_variants(combo,new_edge);
					if (new_edge.variant_ptrs.size() == 0)
						continue;

					current_e_idx = multi_edges.size();
					nodes[i].out_edge_idxs.push_back(current_e_idx);
					nodes[max_idx].in_edge_idxs.push_back(current_e_idx);
					multi_edges.push_back(new_edge);
				}
			}
		}
	}
}


/********************************************************************
	Fills in long edges (similar to the double function, only looks both
	ath the out edges of node i and the in edges of node max_idx to see
	if there is a possible overlap).
	Uses a precomputed list (from config) of aa comobos and masses to quickly
	determine if certain edges are present.
	Flag add_overlap_edges controls if to add a double edge when the 
	same edge can be constructed by single edges.
*********************************************************************/
void PrmGraph::fill_longer_multi_edges(int edge_length, bool add_overlap_edges)
{
	const vector<AA_combo>& aa_edge_combos    = config->get_aa_edge_combos();
	const vector<int>& edge_combo_idxs = config->get_combo_idxs_by_length(edge_length);
	const vector<int>& strong_fragment_idxs = config->get_all_strong_fragment_type_idxs();
	const mass_t tolerance = config->get_tolerance();
	const mass_t pm_tolerance = (config->get_pm_tolerance() > tolerance) ? config->get_pm_tolerance() : tolerance * 0.75;
	const mass_t digest_tolerance = 1.5 * tolerance;
	const mass_t half_tolerance = 0.5 * tolerance;
	const int num_nodes  = nodes.size();
	const int num_combos = edge_combo_idxs.size();
	const mass_t max_combos_mass = aa_edge_combos[edge_combo_idxs[edge_combo_idxs.size()-1]].total_mass;;
	const mass_t max_node_mass = nodes[num_nodes-1].mass;
	

	int i;
	for (i=0; i<num_nodes; i++)
	{
		if (nodes[i].type == NODE_C_TERM)
			continue;

		const mass_t& node_mass = nodes[i].mass;
		const int length_minus_1 = edge_length-1;
		
		int current_e_idx = -1;
		int current_c_idx = -1;

		int last_node_idx=i;
		
		int c;
		for (c=0; c<num_combos; c++)
		{
			const int& combo_idx = edge_combo_idxs[c];
			const AA_combo& combo = aa_edge_combos[combo_idx];
			const mass_t& combo_mass = combo.total_mass;

			// should this combo be check - if there is a single edge with one of the
			// amino acids in the combo skip this combo
			bool is_overlap_edge = false;
			
			int a;
			for (a=0; a<edge_length; a++)
			{
				const int& aa = combo.amino_acids[a];
				if (out_aa_ind[aa][i])
					break;
			}

			if (a<edge_length)
			{
				if (! add_overlap_edges)
					continue;

				is_overlap_edge = true;
			}

			const mass_t exp_connect_mass = node_mass + combo_mass;
			const mass_t min_connect_mass1 = exp_connect_mass - half_tolerance;
			const mass_t max_connect_mass1 = exp_connect_mass + half_tolerance;

			int n_idx;
			score_t max_score = NEG_INF;
			int     max_idx   = -1;
			for (n_idx = last_node_idx+1; n_idx<num_nodes; n_idx++)
			{
				const mass_t& next_node_mass = nodes[n_idx].mass;
				
				if (next_node_mass<min_connect_mass1)
					continue;

				if (next_node_mass>max_connect_mass1)
					break;

				if (nodes[n_idx].score>max_score)
				{
					max_idx = n_idx;
					max_score = nodes[n_idx].score;
				}
			}

			// if couldn't connect with small tolerance, try larger margin
			if (max_idx<0)
			{
				const mass_t min_connect_mass2 = exp_connect_mass - digest_tolerance;
				const mass_t max_connect_mass2 = exp_connect_mass + digest_tolerance;

				int n_idx;
				for (n_idx = last_node_idx+1; n_idx<num_nodes; n_idx++)
				{
					const mass_t& next_node_mass = nodes[n_idx].mass;
					
					if (next_node_mass<min_connect_mass2)
					{
						last_node_idx++;
						continue;
					}

					if (next_node_mass>max_connect_mass2)
						break;

					if (nodes[n_idx].score>max_score)
					{
						max_idx = n_idx;
						max_score = nodes[n_idx].score;
					}
				}
			}


			if (max_score>NEG_INF)
			{
				// need to check if distance between peaks is within tolerance
				mass_t min_diff = fabs(nodes[max_idx].mass-exp_connect_mass);

				// try seeing if there are peaks that can bridge them
				if (min_diff>tolerance)
				{
					const vector<BreakageFragment>& fragments1 = nodes[i].breakage.fragments;
					const vector<BreakageFragment>& fragments2 = nodes[max_idx].breakage.fragments;

					int f;
					for (f=0; f<strong_fragment_idxs.size(); f++)
					{
						const int& strong_frag_idx = strong_fragment_idxs[f];
						const int pos1 = nodes[i].breakage.get_position_of_frag_idx(strong_frag_idx);
						if (pos1<0)
							continue;

						const int pos2 = nodes[max_idx].breakage.get_position_of_frag_idx(strong_frag_idx);
						if (pos2<0)
							continue;

						mass_t mass_diff = fabs(fragments1[pos1].mass - fragments2[pos2].mass);
						
						const int frag_charge = config->get_fragment(strong_frag_idx).charge;
						if (frag_charge>1)
							mass_diff *= frag_charge;

						mass_t diff = fabs(mass_diff - combo_mass);

						if (diff < min_diff)
							min_diff = diff;
					}
				}

				// larger tolerance is allowed for the digest and terminal nodes because they can be
				// created relative to the terminal node
				if (nodes[max_idx].type == NODE_DIGEST || nodes[i].type == NODE_DIGEST)
				{
					if (min_diff>digest_tolerance)
							continue;
				}
				else
				if (nodes[max_idx].type == NODE_C_TERM || nodes[i].type == NODE_N_TERM )
				{
					if (min_diff>digest_tolerance)
						continue;
				}
				else
					if (min_diff>tolerance)
						continue;

				// check if this edge overlaps a subpath of shorter edges with similar 
				// amino acids (already check the node i, so only check what comes in
				// node max_idx
				int a;
				for (a=0; a<edge_length; a++)
				{
					const int& aa = combo.amino_acids[a];
					if (in_aa_ind[aa][max_idx])
						break;
				}

				if (a<edge_length)
				{
					if (! add_overlap_edges)
						continue;

					is_overlap_edge = true;
				}
				
				// add combo idx
				if (current_c_idx != max_idx)
				{
					current_e_idx = find_edge_idx_ij(i,max_idx);
					current_c_idx = max_idx;
				}

				if (current_e_idx>=0 && multi_edges[current_e_idx].num_aa == edge_length)
				{
					MultiEdge& edge = multi_edges[current_e_idx];

					add_and_score_edge_variants(combo,edge);

					if (is_overlap_edge)
						edge.ind_edge_overlaps = true;
				}
				else // create new edge
				{
					MultiEdge new_edge;
		
					new_edge.n_idx = i;
					new_edge.c_idx = max_idx;
					new_edge.n_break = &nodes[i].breakage;
					new_edge.c_break = &nodes[max_idx].breakage;
					new_edge.num_aa= edge_length;
					new_edge.ind_edge_overlaps=is_overlap_edge;

					add_and_score_edge_variants(combo,new_edge);
					if (new_edge.variant_ptrs.size() == 0)
						continue;

					current_e_idx = multi_edges.size();
					nodes[i].out_edge_idxs.push_back(current_e_idx);
					nodes[max_idx].in_edge_idxs.push_back(current_e_idx);
					multi_edges.push_back(new_edge);
				}
			}
		}
	}
}




/***********************************************************
	assigns a value to each node's (log) rank field
	also sets max_node_score.
************************************************************/
void PrmGraph::rank_nodes_according_to_score()
{
	vector<score_pair> pairs;

	int i;
	pairs.resize(nodes.size());

	for (i=0; i<nodes.size(); i++)
	{
		pairs[i].idx=i;
		pairs[i].score = nodes[i].score;
	}

	sort(pairs.begin(),pairs.end());

	for (i=0; i<pairs.size(); i++)
		nodes[pairs[i].idx].log_rank = (float)log(2.0+i);

	max_node_score = NEG_INF;
	for (i=0; i<nodes.size(); i++)
		if (nodes[i].score>max_node_score)
			max_node_score = nodes[i].score;
}


/**********************************************************
	Sets the idx_max_in_score and idx_max_out_score fields
	for the nodes.
***********************************************************/
void PrmGraph::set_idxs_max_in_out_for_nodes()
{
	int i;
	for (i=0; i<nodes.size(); i++)
	{
		int e;
		score_t max_in_score = NEG_INF;
		nodes[i].idx_max_in_score_node=-1;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -