⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 prmgraph.h

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 H
字号:
#ifndef __PRMGRAPH_H__
#define __PRMGRAPH_H__

#include "Spectrum.h"
#include "Model.h"
#include "includes.h"


#define BONUS_FOR_COMPLETE_PEPTIDE 25.0


class PrmGraph;


struct SeqPath;

// This structure contains a path that can be expanded to many variants
struct MultiPath {

	MultiPath() : original_rank(-1), n_term_mass(-1), c_term_mass(-1),
		n_term_aa(N_TERM), c_term_aa(C_TERM), ind_compete_path(0), num_forbidden_nodes(0), path_score(0) {};

	bool check_if_correct(const Peptide& p, Config *config) const;

	int  get_num_correct_aas(const PrmGraph& prm, const Peptide& p, Config *config) const;

	int  get_num_aas() const;

	void print(Config *config, ostream& os=cout) const;

	int original_rank;
	mass_t n_term_mass;
	mass_t c_term_mass;

	int n_term_aa;
	int c_term_aa;

	int ind_compete_path;   // set to 1 if the path is from the N-terminal to the C-terminal

	int num_forbidden_nodes;

	score_t  path_score;

	vector<int> edge_idxs; // for the multi edges
	vector<int> node_idxs;
	vector<Breakage *> breakages;
};





// The structure for a single path corresponsing to one amino acid sequence
struct SeqPath {

	SeqPath() : multi_path_rank(POS_INF), n_term_mass(-1), c_term_mass(-1),
		n_term_aa(N_TERM), c_term_aa(C_TERM), pm_with_19(-1), charge(0),
		num_forbidden_nodes (0), path_score(NEG_INF), multi_path_score(NEG_INF), 
		org_rank(NEG_INF), delta_score(NEG_INF),
		rerank_score(NEG_INF), seq_prob(0), sort_key(0), prm_ptr(NULL), 
		tag_percent_top_5(0), tag_percent_top_20(0), tag_percent_all(0), 
		is_correct(false), is_cut_correct(false) {};


	void print(ostream &os = cout) const;

	void print_with_probs(ostream &os = cout) const;

	void print_full(Config *config, ostream &os = cout) const;

	void get_amino_acids(vector<int>& aas) const
	{
		int i;
		const int num_aa = positions.size()-1;

		if (num_aa<=0)
		{
			aas.clear();
			return;
		}

		aas.resize(num_aa);
		for (i=0; i<num_aa; i++)
			aas[i]=positions[i].aa;
	}

	void convert_QK()
	{
		int i;
		for (i=0; i<positions.size()-2; i++)
			if (positions[i].aa == Lys)
				positions[i].aa = Gln;

		if (c_term_mass<pm_with_19-30 && positions[i].aa == Lys)
			positions[i].aa = Gln;
	}


	// gives a penalty of digest score * # tolerance differences if the
	// peptide is complete and the mass is different form the one
	// used for the prm
	score_t adjust_complete_sequence_penalty(PrmGraph *prm);

	// calculates score according to the aa probabilties
	// gives premium if the path belonged to a denovo solution 
	void calc_confidence_score(int denovo_rank=-1);

	int get_num_correct_aas(const Peptide& pep, Config *config) const;

	mass_t calculate_peptide_mass(Config *config) const;

	
	bool check_if_correct(const string& str, Config *config) const;

	bool check_if_cut_correct(const vector<mass_t>& exp_cut_masses, mass_t tolerance) const;

	// returns the number of b/y ions
	int get_num_frags(const vector<int>& frag_idxs) const;


	void parse_path_to_smaller_ones(Config *config,
		int min_length, int max_length, vector<SeqPath>& new_paths) const;


	int get_num_aa() const { return positions.size()-1; }


	// adds the relevant PathPos to the path and adjusts the other non-terminal values 
	void add_edge_variant(const MultiEdge& edge, int e_idx, int varaint_idx);

	void make_seq_str(Config *config);

	vector<PathPos> positions; // there are n+1 positions for n amino acids

	int multi_path_rank;
		
	mass_t n_term_mass;
	mass_t c_term_mass;

	int n_term_aa;
	int c_term_aa;

	mass_t pm_with_19;
	int    charge;

	int num_forbidden_nodes;

	score_t path_score;

	score_t multi_path_score;

	int		org_rank;

	float   delta_score;

	float	rerank_score;
	
	float	seq_prob;

	float	sort_key; // according to the seq heap is sorted, higher value gets first position in heap

	PrmGraph *prm_ptr;

	string seq_str;

	float	tag_percent_top_5;
	float	tag_percent_top_20;
	float   tag_percent_all;

	bool    is_correct;
	bool	is_cut_correct;
};

bool comp_SeqPath_sort_key(const SeqPath& a, const SeqPath& b);
bool comp_SeqPath_path_score (const SeqPath& a, const SeqPath& b);



class PrmGraph {
	friend class DeNovoDp;
	friend class EdgeModel;
public:

	void create_graph_from_spectrum(Model *model, Spectrum *spectrum, 
					mass_t pm_with_19, int spec_charge=0, bool add_all_pepitde_nodes=false,
					bool only_basic_score = false);

	void create_graph_for_peptide_and_spectrum(Model *model, Spectrum *spectrum, 
					mass_t pm_with_19, int spec_charge, const Peptide& peptide);

	void extract_breakage_infos_for_score_training(Model *model,
												   int frag_idx,
												   int target_region_idx,
												   bool ind_strong_frag,
												   vector<BreakageInfo>& good_examples,
												   vector<BreakageInfo>& bad_examples) const;


	// removes all edges to and from nodes with the active flag set to 0
	void remove_edges_from_inactive_nodes();

	// finds the highest scoring continuous subpath for a given peptide in the graph
	SeqPath get_highest_scoring_subpath(const Peptide& peptide, mass_t start_mass =0) const;

	SeqPath get_longest_subpath(const Peptide& peptide, mass_t start_mass,
						bool verbose = false);

	SeqPath get_path_from_peptide_prm_graph(const Peptide& peptide) const;


	int get_num_nodes() const { return nodes.size(); }
	Spectrum *get_source_spectrum() const { return source_spectrum; }
	Config   *get_config() const { return config; }
	Model    *get_model()  const { return model;  }


	
	void expand_multi_path(const MultiPath& multi_path, 
		                   vector<SeqPath>& seq_paths,
						   score_t min_score,
						   score_t forbidden_pair_penalty,
						   int max_num_paths = -1) const;

	void fast_expand_multi_path(const MultiPath& multi_path, 
		                   vector<SeqPath>& seq_paths,
						   score_t min_score,
						   score_t forbidden_pair_penalty,
						   int max_num_paths = -1) const;

	void fast_expand_multi_path_for_combo_scores(
						   Model *model,
						   const MultiPath& multi_path, 
		                   vector<SeqPath>& seq_paths,
						   score_t min_score,
						   score_t forbidden_pair_penalty,
						   int max_num_paths = -1);

	void parse_seq_path_to_smaller_ones(const SeqPath& org_path, 
										  int min_length, 
										  int max_length, 
										  vector<SeqPath>& new_paths);

	void expand_all_multi_paths(Model *model, const vector<MultiPath>& multi_paths, 
			vector<SeqPath>& paths, score_t forbidden_pair_penalty = 25, int max_num_paths = 100);

	void get_all_correct_node_idxs(const Peptide& peptide, vector<int>& idxs) const;

	void get_all_mirror_node_idxs(const Peptide& peptide, vector<int>& idxs) const;

	// returns the idxs of nodes correponding to the expected breakages of the peptide
	void get_relevant_node_idxs(const Peptide& peptide, vector<int>& idxs) const;

	void fill_breakage_info(const Model *model, BreakageInfo *info, int node_idx,
								  int n_edge_idx, int n_variant_idx,
								  int c_edge_idx, int c_variant_idx, int type =0) const;


	const MultiEdge& get_multi_edge(int e_idx) const { return multi_edges[e_idx]; }
	const Node& get_node(int n_idx) const { return nodes[n_idx]; }
	Node& get_non_const_node(int n_idx) { return nodes[n_idx]; }
	const vector<MultiEdge>& get_multi_edges() const { return multi_edges; }
	const vector<Node>& get_nodes() const { return nodes; }
	const vector<int>& get_forbidden_idxs() const { return forbidden_idxs; }
	void set_frobidden_idxs(const vector<int>& idxs) { forbidden_idxs = idxs; }
	

	mass_t get_max_node_mass() const { return nodes[nodes.size()-1].mass; }

	void   set_significant_masses();

	mass_t get_min_significant_mass() const { return min_significant_mass; }
	mass_t get_max_significant_mass() const { return max_significant_mass; }
	int	   get_min_significant_idx() const { return min_significant_idx; }
	int    get_max_significant_idx() const { return max_significant_idx; }

	score_t    get_max_node_score() const { return max_node_score; }

	bool get_has_node_combo_scores() const { return has_node_combo_scores; }
	void set_has_node_combo_scores(bool b) { has_node_combo_scores = b; }

	// for quick access to nodes array
	PeakRange get_nodes_in_range(mass_t min_r, mass_t max_r) const;
	int get_max_score_node(mass_t exp_mass, mass_t tolerance) const;
	int get_min_dis_node(mass_t mass, mass_t tolerance) const;

	mass_t find_optimal_pm_with_19_for_graph(Model *_model, 
				Spectrum *spectrum, mass_t base_pm_with_19, int spec_charge);

	void print(ostream& os = cout, bool print_edge_scores = false) const;

	void print_only_scores() const;

	void add_edge_scores_to_nodes(); 

	void clear() { nodes.clear(); multi_edges.clear(); index_array.clear(); has_node_combo_scores=false;
				out_aa_ind.clear(); in_aa_ind.clear(); forbidden_idxs.clear(); n_digest_node_idxs.clear();
				c_digest_node_idxs.clear(); source_spectrum=NULL; config=NULL; model=NULL; }

	void print_with_multi_edges() const;

	void print_with_combo_tables() const;

	void print_edge_label(int edge_idx, int var_idx, ostream& os = cout) const;

	int	get_charge() const { return charge; }
	int get_size_idx() const { return size_idx; }
	mass_t get_pm_with_19() const { return pm_with_19; }



	

private:
	vector<Node> nodes;
	vector<MultiEdge> multi_edges;

	vector<int>  index_array;

	Spectrum * source_spectrum;
	Config   * config;
	Model    * model;

	mass_t   pm_with_19; // the pm_with_19 used to create the graph
	int      size_idx;   // 
	int      charge;

	mass_t min_significant_mass;  // the mass of the highest scoring node in the below 10% mass range
	mass_t max_significant_mass;  // the mass of the highest scoring node in the above 90% mass range

	int    min_significant_idx, max_significant_idx;

	score_t max_node_score;

	score_t digest_node_score;

	bool has_node_combo_scores;


	vector<int> forbidden_idxs;

	vector<int> n_digest_node_idxs, c_digest_node_idxs;
	
	vector< vector< bool > > out_aa_ind, in_aa_ind; // holds for each amino acid and node
												 // whether there is a single aa leaving that node


	// functions
	void init_index_array();
	void merge_close_nodes();
	void create_nodes();
	void add_digest_nodes(mass_t n_digest_mass=NEG_INF, mass_t c_digest_mass=NEG_INF);
	void score_nodes(Model *model);

	void create_nodes_for_peptide(const Peptide& pep);
	

	// this function performs all the scoring operations on edges 
	// (amino acid scores, missing cleavage scores etc.)
	score_t calc_edge_variant_score(const MultiEdge& egde, int num_aa, int *aa) const;

	// add the variant ptr and scores for this combo
	void add_and_score_edge_variants(const AA_combo& combo, MultiEdge& edge);


	// fill edges
	void fill_single_multi_edges();
	
	void fill_double_multi_edges(bool add_overlap_edges=false);

	void fill_longer_multi_edges(int max_edge_length, bool add_overlap_edges=false);

	// assigns a value to each node's rank field
	void rank_nodes_according_to_score();

	void set_idxs_max_in_out_for_nodes();

	void fill_forbidden_idx_indicators();


	// finds the edge idx of the edge from node i to j, returns -1 if no such node exists
	int find_edge_idx_ij(int i_idx, int j_idx) const
	{
		const vector<int>& out_idxs = nodes[i_idx].out_edge_idxs;
		int e;
		for (e=out_idxs.size()-1; e>=0; e--)
			if (multi_edges[out_idxs[e]].c_idx == j_idx)
				return out_idxs[e];
		return -1;
	}



	// sorts edges according to the value to which they lead
	void sort_outgoing_edges();

	void sort_outgoing_edges_according_to_max_gains(const vector< vector< score_t > >& max_gains);

	void get_node_ordering_according_to_max_gains(
	 vector< vector< score_t > >& max_gains_for_length,
	 vector<int>& node_order) const;


	// creates a path object from a collection of edges that are assumed
	// to correspond to a path in the graph
	void create_path_from_edges(vector<int>& edge_idxs, MultiPath& path) const;


	void print_multi_edges(int node_idx, bool print_edge_scores = false) const;
};


// more or less the same functions as the map with the spectrum
// why not use template? good question...
// this seems easier...

/*****************************************************************
Returns the indices of all peaks that are within the mass range
******************************************************************/
inline PeakRange  PrmGraph::get_nodes_in_range(mass_t min_r, mass_t max_r) const
{
	PeakRange pr;

	int max_node_idx = nodes.size()-1;

	if (max_r>= pm_with_19-2)
		max_r = pm_with_19-2;

	if (min_r<0)
		min_r=0;

	if (max_r<min_r)
		return pr;

	int i_min=index_array[(int)min_r];
	int i_max=index_array[((int)max_r)];
		
	if (i_max<max_node_idx)
		i_max++;

	while (nodes[i_min].mass<min_r && i_min<i_max)
		i_min++;

	while (nodes[i_max].mass>max_r && i_max>0)
		i_max--;

	if (nodes[i_min].mass > max_r || nodes[i_max].mass<min_r)
		return pr;

	pr.num_peaks = i_max - i_min+1;
	pr.low_idx =   i_min;
	pr.high_idx =  i_max;

	return pr;
}


/***********************************************************
returns the idx of the peak that is closes to the mass
(-1 is returned if no peak is found)
************************************************************/
inline int PrmGraph::get_min_dis_node(mass_t mass, mass_t tolerance) const
{
	PeakRange pr = get_nodes_in_range(mass-tolerance,mass+tolerance);
	if (pr.num_peaks==0)
		return -1;
	mass_t min_dis=fabs(mass - nodes[pr.low_idx].mass);
	int min_idx=pr.low_idx;
	int j;

	// find closest peak to exp_pos
	for (j=1; j<pr.num_peaks; j++)
	{
		mass_t dis = fabs(mass - nodes[pr.low_idx+j].mass);
		if (dis<min_dis)
		{
			min_dis=dis;
			min_idx = pr.low_idx+j;
		}
	}
	return min_idx;
}

/***********************************************************
returns the idx of the peak that has the highest intensity
in the range (-1 is returned if no peak is found)
Balances between intensity and proximity to the expected position
A peak at the edge needs to be at least 2 times stronger than
a peak exactly at the middle to be chosen.
************************************************************/
inline int PrmGraph::get_max_score_node(mass_t exp_mass, mass_t tolerance) const
{
	PeakRange pr = get_nodes_in_range(exp_mass-tolerance,exp_mass+tolerance);
	if (pr.num_peaks==0)
		return -1;

	if (pr.num_peaks ==1)
		return pr.low_idx;

	mass_t max_val = (2*tolerance - fabs(nodes[pr.low_idx].mass - exp_mass)) *
								nodes[pr.low_idx].score;

	int max_idx=pr.low_idx;
	int j;
	// find closest peak to exp_pos
	for (j=1; j<pr.num_peaks; j++)
	{
		mass_t peak_val = (2*tolerance - fabs(nodes[pr.low_idx+j].mass - exp_mass)) *
								nodes[pr.low_idx+j].score;
		if (peak_val>max_val)
		{
			max_val=peak_val;
			max_idx = pr.low_idx+j;
		}
		else
			break;
	}
	return max_idx;
}




#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -