⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovosolutions.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
#include "DeNovoSolutions.h"
#include "DeNovoRankScore.h"
#include "auxfun.h"


mass_t SeqPathKey::tolerance;

int num_messages=0;

/**************************************************************************
	Adds a new seqpath to the existing vector.
	If vector is at maximal size, can replace the lowest confidence path.
	If a path exists with the same seq and n,c masses but lower confidence
	score, replaces it.

  return values:
	0 - score not high enough to be intered in heap
	1 - solution already exists with higher score
	2 - added path with no need to remove something in the heap
	3 - added path but removed a different one from the heap
***************************************************************************/
int SeqPathHeap::add_path(SeqPath& new_path, bool verbose)
{
	if (new_path.sort_key<min_value && paths.size() >= max_size)
		return 0;

	SeqPathKey new_key(new_path);
	set<SeqPathKey>::iterator it=path_keys.find(new_key);
	if (it != path_keys.end())
	{
		if (verbose)
		{
			cout << "Same: " << new_path.seq_str << " " << new_path.n_term_mass << " " << new_path.pm_with_19 << " " << new_path.path_score << " " << new_path.seq_prob << endl;
			cout << "Same: " << it->pep_str << " " << it->n_mass << " " << it->sort_key << endl;
		}

		if (it->sort_key < new_path.sort_key)
		{
			if (verbose)
			{
				cout << "Replaced " << setprecision(6) << it->sort_key << " with " << new_path.sort_key << endl;
				cout << it->n_mass << " : " << new_path.n_term_mass << endl;
				cout << it->pep_str << " : " << new_path.seq_str << endl;
			}
			new_key.path_pos_idx = it->path_pos_idx;
			paths[it->path_pos_idx]=new_path;
			path_keys.erase(it);
			path_keys.insert(new_key);
		}

		// no update of the heap value
		return 1;
	}

	// add path, no need to remove
	if (paths.size() < max_size)
	{
		new_key.path_pos_idx = paths.size();
		paths.push_back(new_path);
		path_keys.insert(new_key);

		min_score_heap.push_back(idx_score_pair(new_key));
		push_heap(min_score_heap.begin(),min_score_heap.end());

		min_idx = min_score_heap[0].path_pos_idx;
		min_value = min_score_heap[0].sort_key;
		return 2;
	}

	// remove lowest score and then add
	new_key.path_pos_idx = min_idx;
	SeqPathKey remove_key(paths[min_idx]);
	it=path_keys.find(remove_key);
	if (it == path_keys.end())
	{
	//	if (num_messages++<10)
	//		cout << "Warning: couldn't find remove key!" << endl;
	//	return;
	}

	paths[min_idx] = new_path;

	if (it != path_keys.end())
		path_keys.erase(it);

	path_keys.insert(new_key);

	pop_heap(min_score_heap.begin(),min_score_heap.end());
	min_score_heap.pop_back();

	min_score_heap.push_back(idx_score_pair(new_key));
	push_heap(min_score_heap.begin(),min_score_heap.end());

	min_idx = min_score_heap[0].path_pos_idx;
	min_value = min_score_heap[0].sort_key;

	return 3;
}



void print_prm_graph_scores(Model *model, Spectrum *spec, 
					 mass_t pm_with_19, int charge)
{
	PrmGraph prm;
	Config *config= model->get_config();

	if (charge> model->get_max_score_model_charge() || charge<1)
	{
		cout << "Charge " << charge << " not supported..." << endl;
		return;
	}


	

	spec->set_charge(charge);

	model->init_model_for_scoring_spectrum(spec);

	mass_t opt_pm_with_19 = prm.find_optimal_pm_with_19_for_graph(model,spec,pm_with_19,charge);

	spec->set_corrected_pm_with_19(opt_pm_with_19);

	if (! strcmp(model->get_model_name().c_str(),"ETD"))
	{
		vector<string> labels;
		labels.push_back("c");
		labels.push_back("z");
		spec->print_expected_fragment_peaks(labels,cout);	
	}
	else
		spec->print_expected_by();

	prm.create_graph_from_spectrum(model,spec,opt_pm_with_19);

	model->score_graph_edges(prm);

	prm.add_edge_scores_to_nodes();

	prm.print_only_scores();

	cout << endl;
}


/**************************************************************************
	Wrapper funciton that generates the desired solutions.
	Combines both local and global solutions (similar to the PepNovoTag
	and LocalTag solutions).
***************************************************************************/
bool generate_denovo_solutions(PrmGraph*& prm,
							   Model *model, 
							   Spectrum *spec,
							   bool denovo_mode,
							   mass_t pm_with_19,
							   int charge,
							   int num_sols, 
							   int min_length, 
							   int max_length,
							   score_t min_score_needed,
							   vector<SeqPath>& solutions,
							   bool only_complete,
							   bool only_from_graph_containing_true_pep,
							   bool need_to_create_PrmGraph)
{
	DeNovoDp dndp;
	Config *config= model->get_config();
	const score_t forbidden_pair_penalty = config->get_forbidden_pair_penalty();
	

	// shorten the generated sequences to improve the accuracy
	if (max_length>10)
	{
		if (pm_with_19>1800)
		{
			if (min_length>=10)
			{
				min_length=8;
				max_length=14;
			}
			else
				max_length=14;
		}

		if (pm_with_19>2200)
		{
			if (min_length>=10)
			{
				min_length=8;
				max_length=12;
			}
			else
				max_length=12;
		}
	}



	if (! prm)
		prm = new PrmGraph;

	if (need_to_create_PrmGraph)
	{
		spec->set_charge(charge);
		model->init_model_for_scoring_spectrum(spec);
		prm->create_graph_from_spectrum(model,spec,pm_with_19,charge);
		model->score_graph_edges(*prm);
	}

//	prm->print_with_multi_edges();
//	exit(0);

	if (only_from_graph_containing_true_pep)
	{
		const Peptide& true_pep = spec->get_peptide();
		if (true_pep.get_num_aas()<3)
		{
			cout << "Error: looking for graphs containing peptide path, but no peptide was given!" << endl;
			exit(1);
		}

		SeqPath best_path = prm->get_longest_subpath(true_pep,0);
		if (best_path.get_num_aa() < true_pep.get_num_aas())
			return false;
	}

	dndp.fill_dp_table(prm,config->get_forbidden_pair_penalty());
	SeqPath best = prm->get_longest_subpath(spec->get_peptide(),0);
//	best.print_full(config);



	if (1)
	{
		SeqPathHeap denovo_path_heap;
		vector<SeqPath> seq_paths;
		vector<MultiPath> multi_paths;

		//	generate global tags from parsing de novo paths
		denovo_path_heap.init(num_sols,config->get_tolerance());

		int num_multi_paths=num_sols;
		if (num_sols<2000)
			num_multi_paths=2000;

		dndp.get_top_scoring_antisymetric_paths_with_length_limits(multi_paths, 
			num_multi_paths, min_length , max_length, forbidden_pair_penalty, min_score_needed, 
			denovo_mode, only_complete);
	

		prm->expand_all_multi_paths(model, multi_paths, seq_paths, forbidden_pair_penalty, num_sols);

		int i;
		for (i=0; i<seq_paths.size(); i++)
		{
			seq_paths[i].pm_with_19 = pm_with_19;
			seq_paths[i].charge = charge;
			seq_paths[i].make_seq_str(config);
			seq_paths[i].sort_key = seq_paths[i].path_score;

			// if peptide is whole, make sure its mass is within the pm tolerance
			if (seq_paths[i].n_term_mass<1.0 && seq_paths[i].c_term_mass + 30.0 > seq_paths[i].pm_with_19)
			{
				const mass_t pep_mass = seq_paths[i].calculate_peptide_mass(config) + MASS_OHHH;
				const mass_t delta = fabs(pep_mass - spec->get_org_pm_with_19());
			//	cout << i << "\t" << pep_mass << "\t" << spec->get_org_pm_with_19() << "\t" << delta << endl;
			//	if (delta > config->get_pm_tolerance())
			//		continue;
			}
			denovo_path_heap.add_path(seq_paths[i]);
		}

		sort(denovo_path_heap.paths.begin(),denovo_path_heap.paths.end(),comp_SeqPath_path_score);

		solutions = denovo_path_heap.paths;
		for (i=0; i<solutions.size(); i++)
		{
			solutions[i].delta_score = solutions[0].path_score - solutions[i].path_score;
			solutions[i].make_seq_str(config);
		}
	}

	

//	cout << "SOLS: " << solutions.size() << " " << min_score_needed << endl;
	return true;
}

/**************************************************************************
	Wrapper funciton that generates the desired solutions.
	Combines both local and global solutions (similar to the PepNovoTag
	and LocalTag solutions).
***************************************************************************/
bool generate_denovo_solutions_with_good_start_end_idxs(
							   PrmGraph*& prm,
							   Model *model, 
							   Spectrum *spec,
							   bool denovo_mode,
							   mass_t pm_with_19,
							   int  charge,
							   int num_sols, 
							   int min_length, 
							   int max_length,
							   vector<SeqPath>& solutions)
{
	DeNovoDp dndp;
	
	Config *config= model->get_config();
	const score_t forbidden_pair_penalty = config->get_forbidden_pair_penalty();
	
	if (charge>3 || charge<1)
	{
		solutions.clear();
		return false;
	}

	

	if (! prm)
		prm = new PrmGraph;

	spec->set_charge(charge);
	model->init_model_for_scoring_spectrum(spec);
	prm->create_graph_from_spectrum(model,spec,pm_with_19);
	model->score_graph_edges(*prm);

	dndp.fill_dp_table(prm,forbidden_pair_penalty);

	// find which nodes are good start/end points
	const vector<Node>& nodes = prm->get_nodes();
	vector<bool> good_idx_inds;

	good_idx_inds.resize(nodes.size(),false);
	int m_idx=0,n_idx=0;
	vector<mass_t> good_masses;
	spec->get_peptide().calc_expected_breakage_masses(config,good_masses);
	const int num_nodes = nodes.size(), num_masses = good_masses.size();

	good_idx_inds[0]=true;
	good_idx_inds[num_nodes-1]=true;
	while (n_idx<num_nodes && m_idx<num_masses)
	{
		const mass_t m_mass = good_masses[m_idx];
		const mass_t n_mass = nodes[n_idx].mass;

		if (fabs(m_mass-n_mass)<1.25)
			good_idx_inds[n_idx]=true;

		if (m_mass<n_mass-2)
		{
			m_idx++;
		}
		else
			n_idx++;
	}

	SeqPathHeap denovo_path_heap;
	vector<MultiPath> multi_paths;

	//	generate global tags from parsing de novo paths
	denovo_path_heap.init(num_sols,config->get_tolerance());

	dndp.get_top_scoring_antisymetric_paths_with_specified_start_end_idxs(
		good_idx_inds, multi_paths, num_sols, min_length, max_length, config->get_forbidden_pair_penalty());

	vector<SeqPath> seq_paths;
	prm->expand_all_multi_paths(model, multi_paths,seq_paths, forbidden_pair_penalty, num_sols);

	// parse seq paths

	int i;
	for (i=0; i<seq_paths.size(); i++)
	{
		seq_paths[i].sort_key = seq_paths[i].path_score;
		denovo_path_heap.add_path(seq_paths[i]);
	}
	
	sort(denovo_path_heap.paths.begin(),denovo_path_heap.paths.end(),comp_SeqPath_path_score);

	solutions = denovo_path_heap.paths;
	
	return true;
}





/***************************************************************************
	Wrapper function that generates several solutions according to different 
	precursor masses.
****************************************************************************/
void generate_denovo_solutions_from_several_pms(vector<PrmGraph *>& prm_ptrs,
												AdvancedScoreModel *model,
												Spectrum *spec,
												bool ind_denovo_mode,
												int num_sols, 
												int min_length, 
												int max_length,
												vector<mass_t>&  different_pms_with_19,
												vector<int>& charges,
												vector<SeqPath>& best_solutions,
												bool ind_only_complete)
{
	static SeqPathHeap seq_heap;
	best_solutions.clear();

	seq_heap.init(num_sols, model->get_config()->get_tolerance());

	int i;
	for (i=0; i<different_pms_with_19.size(); i++)
	{
		const score_t min_score_needed = ( (! ind_denovo_mode || i==0 || seq_heap.min_score_heap.size()<num_sols) ?
									 NEG_INF :  seq_heap.min_value );
	
		vector<SeqPath> sols;
		generate_denovo_solutions(prm_ptrs[i], model,spec, ind_denovo_mode,
			different_pms_with_19[i],charges[i],num_sols,min_length, max_length, 
			min_score_needed, sols, ind_only_complete);

		// removes duplicated
		int j;
		for (j=0; j<sols.size(); j++)
		{
			seq_heap.add_path(sols[j]);
		}

		// impose time limit
	}

	best_solutions = seq_heap.paths;


	if (ind_denovo_mode)
	{
		sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_path_score);
	}
	else
		sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_sort_key);

	for (i=0; i<best_solutions.size(); i++)
		best_solutions[i].org_rank=i;
}

void generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(
							   vector<PrmGraph *>& prm_ptrs,
							   AdvancedScoreModel *model,
							   Spectrum *spec,
							   bool ind_denovo_mode,
							   int num_sols, int min_length, int max_length,
							   vector<mass_t>& different_pms_with_19,
							   vector<int>& charges,
							   vector<SeqPath>& best_solutions)
{
	static SeqPathHeap seq_heap;
	best_solutions.clear();

	seq_heap.init(num_sols, model->get_config()->get_tolerance());

	int i;
	for (i=0; i<different_pms_with_19.size(); i++)
	{
		vector<SeqPath> sols;
		generate_denovo_solutions_with_good_start_end_idxs(prm_ptrs[i], model,spec, ind_denovo_mode,
			different_pms_with_19[i],charges[i],num_sols,min_length,max_length,sols);

		// removes duplicated
		int j;
		for (j=0; j<sols.size(); j++)
			seq_heap.add_path(sols[j]);
	}

	best_solutions = seq_heap.paths;

	if (ind_denovo_mode)
	{
		sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_path_score);
	}
	else
		sort(best_solutions.begin(),best_solutions.end(),comp_SeqPath_sort_key);

	for (i=0; i<best_solutions.size(); i++)
		best_solutions[i].org_rank=i;
}


/***************************************************************************

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -