⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovosolutions.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
Tests each tag to see how many de novo paths it is part of.
Sets the variablesi: multi_path_rank (first rank),percent5, percent20, percent_all
in each tag.
****************************************************************************/
void determine_tag_containment_ratios(vector<SeqPath>& tags, 
									  const vector<SeqPath>& paths,
									  const vector<score_pair>& denovo_scores,
									  int start_idx)
{

	int i;
	for (i=start_idx; i<tags.size(); i++)
	{
		int num_5=0, num_20=0, num_all=0, first=POS_INF;
		const vector<PathPos>& tag_positions = tags[i].positions;
		const int first_node_idx = tag_positions[0].node_idx;

	/*	cout << "Tag: ";
		int q;
		for (q=0; q<tag_positions.size(); q++)
			cout << tag_positions[q].node_idx << " ";
		cout << endl;*/

		int j;
		for (j=0; j<denovo_scores.size(); j++)
		{
			const SeqPath& denovo_path = paths[denovo_scores[j].idx];
			const vector<PathPos>& path_positions = denovo_path.positions;
			const int last_pos = path_positions.size()-1;
		
	/*		cout << "Path " << j << " : ";
			int q;
			for (q=0; q<path_positions.size(); q++)
				cout << path_positions[q].node_idx << " ";
			cout << endl;*/

			
			int k=0;
			while (k<last_pos && path_positions[k].node_idx<first_node_idx)
				k++;

			if (path_positions[k].node_idx != first_node_idx)
				continue;

			int a;
			for (a=1; a<tag_positions.size() && a+k<path_positions.size(); a++)
				if (tag_positions[a].node_idx != path_positions[a+k].node_idx)
					break;

			if (a<tag_positions.size())
				continue;

			if (j<first)
				first=j;

			num_all++;
			if (j<20)
				num_20++;
			if (j<5)
				num_5++;
		}

		tags[i].multi_path_rank=first;
		if (paths.size()>0)
		{
			tags[i].tag_percent_top_5  = num_5*0.2;
			tags[i].tag_percent_top_20 = num_20*0.05;
			tags[i].tag_percent_all	   = num_all/(float)paths.size();
		}

	//	cout << i << "\t" << tags[i].seq_str << "\t" << tags[i].multi_path_rank <<
	//		"\t" << tags[i].tag_percent_top_5 << "\t" <<
	//		tags[i].tag_percent_top_20 << "\t" << setprecision(4) << tags[i].tag_percent_all << endl;
	}
//	exit(0);
}


/*************************************************************************
Generates tags by making a mixture of local/de novo tags
checks which tags appear in the longer denovo sequences sets the boolean
indicators in the tag seq paths
**************************************************************************/
void generate_tags(vector<PrmGraph *>& prm_ptrs,
				   AdvancedScoreModel *model,
				   BasicSpectrum& bs,
				   Spectrum *spec,
				   const vector<int>& final_num_tags,
				   int main_tag_length,				 // the length for which we parse de novo sequences
				   const vector<mass_t>& pms_with_19,
				   const vector<int>& charges,
				   vector<SeqPath>& final_tags,
				   bool use_original_num_tags,
				   int  prm_ptr_start_idx)
{
	const int max_tag_length=10;
	const double search_time_limit = 7.0;
	const int denovo_seq_heap_size = 50;
	static SeqPathHeap denovo_seq_heap;
	static vector<SeqPathHeap> tag_heaps;

	Config *config = model->get_config();
	const mass_t tolerance = config->get_tolerance();
	const int charge   = charges[0];
	const int size_idx = config->calc_size_idx(charge,pms_with_19[0]);
	
	int i;

	int min_denovo_length=6;
	for (i=0; i<final_num_tags.size(); i++)
		if (final_num_tags[i]>0)
		{
			min_denovo_length=i;
			break;
		}
	if (min_denovo_length<6)
		min_denovo_length=6;

	final_tags.clear();

	// init models, num tags, etc.
	// don't use de novo where it is time consuming and not accurate
	int num_denovo_solutions = denovo_seq_heap_size;
	bool perform_denovo_rerank=false;
	bool perform_denovo=true;
	if (charge==2 && pms_with_19[0]>=2400 || 
		charge>2 && pms_with_19[0]>2450)
	{
		perform_denovo=false;
		num_denovo_solutions=0;
	}

	DeNovoRankScorer * denovo_rank_model = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
/*	if (0 && perform_denovo && denovo_rank_model && denovo_rank_model->get_ind_part_model_was_initialized(charge,size_idx))
	{
		perform_denovo_rerank=true;
		num_denovo_solutions = denovo_seq_heap_size * 3;
	}*/

	vector<bool> perform_tag_reranks;
	vector<int> num_tags;
	vector<DeNovoRankScorer *> tag_rank_models;

	perform_tag_reranks.resize(max_tag_length,false);
	num_tags.resize(max_tag_length,0);
	tag_rank_models.resize(max_tag_length,NULL);

	int tag_round=0;
	int tag_length;
	for (tag_length=0; tag_length<final_num_tags.size() && tag_length<max_tag_length; tag_length++)
	{
		if (final_num_tags[tag_length]<=0)
			continue;
	
		num_tags[tag_length]=final_num_tags[tag_length];
		tag_rank_models[tag_length] = (DeNovoRankScorer *)model->get_rank_tag_model_ptr(tag_length);
		if (tag_rank_models[tag_length] && tag_rank_models[tag_length]->get_ind_part_model_was_initialized(charge,size_idx))
		{	
			perform_tag_reranks[tag_length]=true;
			if (! use_original_num_tags)
			{
				num_tags[tag_length] *= (4+2*tag_round);	// add more to larger lengths since they are covered by shorter tags
				num_tags[tag_length] += 10;
			}
		}
		tag_round++;						
	}

	denovo_seq_heap.init(denovo_seq_heap_size, tolerance);
	tag_heaps.resize(max_tag_length);
	for (tag_length=0; tag_length<max_tag_length; tag_length++)
		if (num_tags[tag_length]>0)
			tag_heaps[tag_length].init(num_tags[tag_length],tolerance);
	
	const clock_t start_t = clock();

	// collect de novo sequences and generate local tag
	for (i=0; i<pms_with_19.size(); i++)
	{
		// ignore differnet charges, should not be here
		if (charges[i] != charges[0])
			continue;

		const clock_t end_t = clock();
		const double run_time = (end_t - start_t)/(double)CLOCKS_PER_SEC;

		// limit the time spent here if searches run too long
		if (i>0 && run_time>search_time_limit)
		{
			break;
		}

		// First generate de novo solutions 
		const score_t min_seq_score_needed = ( (i==0 || denovo_seq_heap.min_score_heap.size()<num_denovo_solutions) ?
											 NEG_INF :  denovo_seq_heap.min_value );

		int max_length = 13;
		if (pms_with_19[i]>1800)
			max_length = 10;
		if (charges[i]>2 || pms_with_19[i]>2200)
			max_length = 9;

		if (perform_denovo)
		{
			vector<SeqPath> denovo_sols;


			generate_denovo_solutions(prm_ptrs[i+prm_ptr_start_idx],  model, spec, true,
				pms_with_19[i], charges[i], num_denovo_solutions, min_denovo_length, max_length,
				min_seq_score_needed, denovo_sols, false);

			if (i==0 && bs.ssf->peptide.get_num_aas()>2)
			{
			//	cout << endl;
			//	SeqPath best = prm_ptrs[0+prm_ptr_start_idx]->get_longest_subpath(bs.ssf->peptide,0);
			//	spec->print_expected_by();
			//	best.print_full(config);
			//	prm_ptrs[0]->print();
			//	exit(0);

			}

			// just remove duplicates
			int j;
			for (j=0; j<denovo_sols.size(); j++)
				denovo_seq_heap.add_path(denovo_sols[j]);
		}
		
		// generate local tags
		int tag_length;
		for (tag_length=0; tag_length<max_tag_length; tag_length++)
		{
			if (num_tags[tag_length]>0)
			{
				const score_t min_tag_score_needed = ( (i==0 || tag_heaps[tag_length].min_score_heap.size()<num_tags[tag_length]) ?
													   NEG_INF :  tag_heaps[tag_length].min_value );		
				vector<SeqPath> tag_sols;
				generate_denovo_solutions(prm_ptrs[i+prm_ptr_start_idx],  model, spec, false,
					pms_with_19[i], charges[i], num_tags[tag_length], tag_length, tag_length, 
					min_tag_score_needed, tag_sols, false, false, (!perform_denovo));

				int j;
				for (j=0; j<tag_sols.size(); j++)
				{
					tag_heaps[tag_length].add_path(tag_sols[j]);
					if (tag_sols[j].get_num_aa() != tag_length)
					{
						cout << "problem1  " << j << " " << tag_sols[j].get_num_aa() << " : " << tag_length <<endl;
					}
				}
			}
		}
	}

	// sort de novo seqs
	vector<score_pair> denovo_scores;
	denovo_scores.clear();
	if (perform_denovo)
	{
		vector<SeqPath>& denovo_seqs = denovo_seq_heap.paths;
		if (0) // don't rerank sequences
		{
			denovo_rank_model->score_denovo_sequences(denovo_seqs,bs.ssf,bs.peaks,bs.num_peaks,denovo_scores,size_idx);
			sort(denovo_scores.begin(),denovo_scores.end());
		}
		else
		{
			int i;
			denovo_scores.resize(denovo_seqs.size());
			for (i=0; i<denovo_scores.size(); i++)
			{
				denovo_scores[i].idx=i;
				denovo_scores[i].score=denovo_seqs[i].path_score;
			}
			sort(denovo_scores.begin(),denovo_scores.end());
		}

		// parse de novo paths into tags and add them to the tag heap
		if (num_tags[main_tag_length]>0)
		{
			int max_num_to_force = num_tags[main_tag_length]/4;
			if (max_num_to_force>15)
				max_num_to_force=15;

			int num_forced=0;
			int i;
			for (i=0; i<denovo_scores.size(); i++)
			{
				SeqPath& big_path = denovo_seqs[denovo_scores[i].idx];
				vector<SeqPath> parsed_tags;
				
				PrmGraph *prm_ptr = big_path.prm_ptr;
				big_path.multi_path_rank = i;
				prm_ptr->parse_seq_path_to_smaller_ones(big_path, main_tag_length, main_tag_length, parsed_tags);

				int j;
				for (j=0; j<parsed_tags.size(); j++)
				{
					int add_return_value = tag_heaps[main_tag_length].add_path(parsed_tags[j]);

					// if the tag came from one of the top 5 sequences and it was rejected
					// because of a low score, we'll force it in by adding a high score +10000
					// that will later be removed
					if (add_return_value == 0 && 
						num_forced<max_num_to_force && 
						parsed_tags[j].multi_path_rank<5)
					{
						parsed_tags[j].sort_key += 10000.0;
						num_forced++;

					//	int ret_val = tag_heaps[main_tag_idx].add_path(parsed_tags[j]);
					//	cout << ret_val << " " << num_forced << " Added " << parsed_tags[j].multi_path_rank << "\t" << 
					//		parsed_tags[j].seq_str << "\t" << parsed_tags[j].path_score << "\t"
					//		<< parsed_tags[j].sort_key <<"\t" << tag_heaps[main_tag_idx].min_value << endl;
					}

					if (parsed_tags[j].get_num_aa() != main_tag_length)
					{
						cout << "Problem2 " << j << " " << main_tag_length << " : " << parsed_tags[j].get_num_aa() << endl;
						exit(1);
					}
				}
			}

			// since the heap will not be used as a heap anymore (it is now just a container)
			// we can go back and remove the bonus scores that were added
			for (i=0; i<tag_heaps[main_tag_length].paths.size(); i++)
			{
				if (tag_heaps[main_tag_length].paths[i].sort_key>8000.0)
				{
					tag_heaps[main_tag_length].paths[i].sort_key-=10000.0;
				}
			}
		}
	}

	// sort tag heaps
	tag_round=0;
	for (tag_length=0; tag_length<max_tag_length; tag_length++)
	{	
		if (num_tags[tag_length]<=0)
			continue;

		vector<SeqPath>& bigger_tags = tag_heaps[tag_length].paths;
		sort(bigger_tags.begin(),bigger_tags.end(),comp_SeqPath_path_score);
		
		const float top_score = (bigger_tags.size()>0 ? bigger_tags[0].path_score : NEG_INF);
		int i;
		for (i=0; i<bigger_tags.size(); i++)
		{
			bigger_tags[i].org_rank=i;
			bigger_tags[i].delta_score = top_score - bigger_tags[i].path_score;
		}

		// check that tags are not covered by previous shorter ones
		if (tag_round>0)
		{
			
			const vector<SeqPath>& smaller_tags = final_tags;
			int i;
			for (i=0; i<bigger_tags.size(); i++)
			{
				const vector<PathPos>& big_positions = bigger_tags[i].positions;
				const int bigger_length = big_positions.size()-1;

				int j;
				for (j=0; j<smaller_tags.size(); j++)
				{
					const vector<PathPos>& small_positions = smaller_tags[j].positions;
					const int smaller_length = small_positions.size()-1;
					const int length_diff  = bigger_length - smaller_length;

					int k;
					for (k=0; k<length_diff; k++)
					{
						if (small_positions[0].aa == big_positions[k].aa)
						{
							int a;
							for (a=1; a<smaller_length; a++)
								if (small_positions[a].aa != big_positions[a+k].aa)
									break;
							if (a==smaller_length)
								break;
						}
					}
					if (k<length_diff)
						break;
				}

				// this tag is covered remove it
				if (j<smaller_tags.size())
				{
					bigger_tags[i]=bigger_tags[bigger_tags.size()-1];
					bigger_tags.pop_back();
					i--;
				}
			}
			
		}
	
		if (perform_denovo)
			determine_tag_containment_ratios(bigger_tags,denovo_seq_heap.paths, denovo_scores,0);

		// sort tags
		vector<score_pair> tag_scores;
		if (perform_tag_reranks[tag_length])
		{
			tag_rank_models[tag_length]->score_tag_sequences(bigger_tags,bs.ssf,bs.peaks,bs.num_peaks,tag_scores,size_idx);
			sort(tag_scores.begin(),tag_scores.end());
			int i;
			for (i=0; i<tag_scores.size(); i++)
				bigger_tags[tag_scores[i].idx].rerank_score = tag_scores[i].score;
		}
		else
		{
			int i;
			tag_scores.resize(bigger_tags.size());
			for (i=0; i<tag_scores.size(); i++)
			{
				tag_scores[i].idx=i;
				tag_scores[i].score=bigger_tags[i].path_score;
			}
		}

		while (tag_scores.size()>final_num_tags[tag_length])
			tag_scores.pop_back();

		for (i=0; i<tag_scores.size(); i++)
		{
			final_tags.push_back(bigger_tags[tag_scores[i].idx]);
		}
		tag_round++;	
	}
}











void output_denovo_solutions(SingleSpectrumFile *ssf, Config *config, ostream& out_stream,
							 const vector<SeqPath>& solutions, int max_num_sols)
{
	if (max_num_sols<0)
		max_num_sols=solutions.size();

	ssf->print_ssf_stats(config,out_stream);

	if (solutions.size() == 0)
	{
		out_stream << "No solutions found." << endl;
	}
	else 
	{
		out_stream << "#Index\tScore\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
		int i; 	
		for (i=0; i<solutions.size() && i<max_num_sols; i++) 
		{
			mass_t c_gap=solutions[i].pm_with_19 - solutions[i].c_term_mass;
			if (c_gap<24.0)
				c_gap = 0;

			out_stream << setprecision(3) << fixed << i << "\t";
			out_stream << solutions[i].path_score << "\t";
			out_stream << solutions[i].n_term_mass << "\t";
			out_stream << c_gap << "\t";
			out_stream << solutions[i].pm_with_19 << "\t";
			out_stream << solutions[i].charge << "\t";
			out_stream << solutions[i].seq_str;
			out_stream << endl;
		}
	}
	out_stream << endl;
}



void output_tag_solutions(SingleSpectrumFile *ssf, 
						  Config *config, ostream& out_stream,
							const vector<SeqPath>& solutions)
{
	ssf->print_ssf_stats(config,out_stream);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -