⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovorankscore.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	{
		cout << "Error: peak model not initialized!" << endl;
		exit(1);
	}

	if (! comp_assigner)
	{
		cout << "Error: comp assigner not initialized!" << endl;
		exit(1);
	}

	string dir = model->get_config()->get_resource_dir();
	string base_path = dir + "/" + name;
	string model_file_name = base_path + "_model.txt";

	ofstream ofs(model_file_name.c_str());
	if (! ofs.is_open() || ! ofs.good())
	{
		cout << "Error: couldn't open file for reading: " << model_file_name << endl;
		exit(1);
	}

	ofs << name << " " << model_type << endl;
	ofs << model->get_max_score_model_charge() << endl;
	ofs << model->get_model_name() << endl;
	ofs << peak_prediction_models[model_type]->get_peak_rank_model_name() << endl;
	ofs << comp_assigner->get_model_name() << endl;

	int c;
	for (c=0; c<this->dnv_part_models.size(); c++)
	{
		int size_idx;
		for (size_idx=0; size_idx<dnv_part_models[c].size(); size_idx++)
		{
			if (dnv_part_models[c][size_idx] && 
				dnv_part_models[c][size_idx]->ind_was_initialized)
			{
				ostringstream oss;
				oss << base_path << "_" << c << "_" << size_idx << ".txt";
				string path = oss.str();
				dnv_part_models[c][size_idx]->write_denovo_partition_model(this->model_type, path.c_str());
			}
		}
	}
}



/*********************************************************************
Adds the counts for peaks around a breakage to their respective bins

**********************************************************************/
void add_offset_counts_for_unannotated_peaks_arround_mass(vector<int>& counts, 
									AnnotatedSpectrum *spec,
									mass_t min_mass, 
									mass_t max_mass, 
									mass_t bin_coef, 
									mass_t break_mass,
									int charge)
{
	const vector< vector<PeakAnnotation> >& peak_anns = spec->get_peak_annotations();
	const mass_t low_range = (break_mass + min_mass+1)/(mass_t)charge;
	const mass_t high_range = (break_mass + max_mass)/(mass_t)charge;
	const PeakRange pr = spec->get_peaks_in_range(low_range, high_range);

//	cout << spec->get_peptide().as_string(spec->get_config()) << " " << break_mass << endl;
	
	if (pr.num_peaks<=0)
		return;

	int skipped=0;

	// add counts
	int p_idx;
	for (p_idx = pr.low_idx; p_idx<=pr.high_idx; p_idx++)
	{
		if (spec->get_peak_iso_level(p_idx)>0)
			continue;

		if (peak_anns[p_idx].size()>0)
		{
			skipped++;
			continue;
		}
	
		const mass_t peak_mass = spec->get_peak_mass(p_idx);
		const mass_t b_mass = break_mass/(mass_t)charge;
		const int bin = (int)((peak_mass - b_mass - min_mass)*bin_coef);
		
		counts[bin]+=10;
		counts[bin-1]+=9;
		counts[bin+1]+=9;
		counts[bin-2]+=6;
		counts[bin+2]+=6;
		counts[bin-3]+=4;
		counts[bin+3]+=4;
		counts[bin-4]+=2;
		counts[bin+4]+=2;
	}
}


/***********************************************************************
Uses the offset count method to determine if there are any fragments
that are special for a given PTM (function only prints a list of
the most interesting cases).
************************************************************************/
void find_special_PTM_frags_using_offset_counts(
										   const string& PTM_label,
										   FileManager& fm,
										   const vector<SingleSpectrumFile *>& all_ssfs,
										   Model *model,
										   int max_charge)
{
	Config *config = model->get_config();
	const mass_t min_offset_mass = -120;
	const mass_t max_offset_mass = 120;
	const mass_t tolerance = config->get_tolerance();
	const mass_t bin_size = tolerance * 0.1;
	const mass_t bin_coef = 1.0 / bin_size;
	const int count_size = (int)((max_offset_mass - min_offset_mass + 1) / bin_size);
	vector< vector< vector<int> > > prefix_counts, suffix_counts; // charge, distance from cut, bin_idx
	vector< vector< int > > pre_instances, suf_instances;
	int i,c,d;

	float min_frag_prob = 0.02;

	const int ptm_aa = config->get_aa_from_label(PTM_label);
	if (ptm_aa <0)
	{
		cout << "Error: PTM not supported in this model: " << PTM_label << endl;
		exit(1);
	}

	const int max_distance = 1;
	
	prefix_counts.resize(max_charge+1);
	suffix_counts.resize(max_charge+1);
	pre_instances.resize(max_charge+1);
	suf_instances.resize(max_charge+1);

	for (c=1; c<=max_charge; c++)
	{
		prefix_counts[c].resize(max_distance+1);
		suffix_counts[c].resize(max_distance+1);
		pre_instances[c].resize(max_distance+1,0);
		suf_instances[c].resize(max_distance+1,0);
		for (d=0; d<=max_distance; d++)
		{
			prefix_counts[c][d].resize(count_size,0);
			suffix_counts[c][d].resize(count_size,0);
		}
	}

	vector<QCPeak> peaks;
	BasicSpecReader bsr;

	peaks.resize(5000);

	int spectra_used=0;
	for (i=0; i<all_ssfs.size(); i++)
	{
		AnnotatedSpectrum as;
		vector<mass_t> break_masses;

		const Peptide& pep = all_ssfs[i]->peptide;
		const vector<int>& amino_acids = pep.get_amino_acids();
		int aa_idx;
		for (aa_idx = 0; aa_idx<amino_acids.size(); aa_idx++)
			if (amino_acids[aa_idx] == ptm_aa)
				break;
		if (aa_idx == amino_acids.size())
			continue;
		
		int num_peaks = bsr.read_basic_spec(config,fm,all_ssfs[i],&peaks[0]);
		as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssfs[i]);
		as.set_peptide(pep);
		as.annotate_spectrum(pep.get_mass_with_19(),true);
		spectra_used++;

		
		pep.calc_expected_breakage_masses(config,break_masses);
		const mass_t true_mass = as.get_peptide().get_mass();
		for (aa_idx=0; aa_idx<amino_acids.size(); aa_idx++)
		{
			if (amino_acids[aa_idx] != ptm_aa)
				continue;

			int d;
			for (d=0; d<=max_distance; d++)
			{
				int cut_idx = aa_idx + 1 -d;
				if (cut_idx<1)
					break;

				int c;
				const mass_t break_mass = break_masses[cut_idx];
				for (c=1; c<=max_charge; c++)
				{
				//	cout << "P: " << c << " " << d << " ";
					add_offset_counts_for_unannotated_peaks_arround_mass(prefix_counts[c][d], &as,
						min_offset_mass, max_offset_mass, bin_coef, break_mass,c);
					pre_instances[c][d]++;
				}
			}

			for (d=0; d<=max_distance; d++)
			{
				int cut_idx = aa_idx + d;
				if (cut_idx>amino_acids.size())
					break;
				
				int c;
				const mass_t break_mass = break_masses[cut_idx];
				for (c=1; c<=max_charge; c++)
				{
				//	cout << "S: " << c << " " << d << " ";
					add_offset_counts_for_unannotated_peaks_arround_mass(suffix_counts[c][d], &as,
						min_offset_mass, max_offset_mass, bin_coef, (true_mass - break_mass),c);
					suf_instances[c][d]++;
				}
			}
		}
	}

	cout << "Using: " << spectra_used << " spectra for offset counts..." << endl;

	// select 30 top fragments becasue many are likely to be caused by previous/next
	// amino acids and will be later removed
	vector<FragmentTypeSet> pre_fts , suf_fts;

	pre_fts.resize(max_distance+1);
	suf_fts.resize(max_distance+1);
	for (c=1; c<=max_charge; c++)
	{
		int d;
		for (d=0; d<=max_distance; d++)
		{
			select_fragments_from_bins(prefix_counts[c][d],pre_fts[d],30,c,PREFIX,min_offset_mass,bin_coef,tolerance);
			select_fragments_from_bins(suffix_counts[c][d],suf_fts[d],30,c,SUFFIX,min_offset_mass,bin_coef,tolerance);

		/*	int f;
			for (f=0; f<pre_fts[d].get_num_fragments(); f++)
			{
				FragmentType& frag = pre_fts[d].get_non_const_fragment(f);
				frag.prob = frag.prob / (float)pre_instances[frag.charge][d];
			}
			pre_fts[d].sort_fragments_according_to_probs();

			for (f=0; f<suf_fts[d].get_num_fragments(); f++)
			{
				FragmentType& frag = suf_fts[d].get_non_const_fragment(f);
				frag.prob = frag.prob / (float)suf_instances[frag.charge][d];
			}
			suf_fts[d].sort_fragments_according_to_probs();*/
		}
	}


	for (d=0; d<=max_distance; d++)
	{
	//	calculate_true_fragment_probabilities(fm,config,fts[d], min_frag_prob);

		cout << "Fragments selected from spectra [distance " << d << "]:" << endl;
		for (i=0; i<pre_fts[d].get_num_fragments() && i<5; i++)
		{
			const FragmentType& frag = pre_fts[d].get_fragment(i);
			cout << left << setw(3) << i << right << setw(5) << frag.spec_count << " ";
			cout << setw(6) << setprecision(3) << right << frag.prob << " ";
			frag.write_fragment(cout);
		}
		cout << endl;

		for (i=0; i<suf_fts[d].get_num_fragments() && i<5; i++)
		{
			const FragmentType& frag = suf_fts[d].get_fragment(i);
			cout << left << setw(3) << i << right << setw(5) << frag.spec_count << " ";
			cout << setw(6) << setprecision(3) << right << frag.prob << " ";
			frag.write_fragment(cout);
		}
		cout << endl;
	}
}





/****************************************************************************
Runs benchmark on spectra with a complete de novo solution in there
*****************************************************************************/
void benchmark_ranking_on_full_denovo(AdvancedScoreModel *model,
									  char *mgf_test_file, 
									  int max_num_spectra,
									  int num_solutions,
									  char *report_path,
									  int min_length,
									  int max_length)
{
	const int charge = 2;

	Config *config;

	char report_file_path[512];
	char *report_name=NULL;

	ofstream out_file_stream;
	
	if (report_path)
	{
		sprintf(report_file_path,"%s_bench_test.txt",report_path);
		report_name = report_file_path;
		out_file_stream.open(report_file_path);
		cout << "Writing results to " << report_file_path << endl;
	}
	else
		cout << "Writing results to cout..." << endl;


	
	ostream& sum_stream	= (report_path ? out_file_stream : cout);

	FileManager fm;
	FileSet		fs;
	vector<PrmGraph *> prm_ptrs;
	PrmGraph opt_prm;

	config = model->get_config();
	config->set_use_spectrum_charge(2);
	const mass_t tolerance = config->get_tolerance();


	fm.init_from_file(config,mgf_test_file);
	fs.select_all_files(fm);
	fs.randomly_reduce_ssfs((int)(max_num_spectra * 1.5));
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	
	// 0 all seqs , 1 complete seqs, 2 all cuts ,3 complete cuts
	vector< vector<int> > top_rerank, top_denovo;
	vector<double> total_corr_denovo, total_pred_denovo, total_corr_rerank, total_pred_rerank;

	top_rerank.resize(4);
	top_denovo.resize(4);
	total_corr_denovo.resize(4,0);
	total_pred_denovo.resize(4,0);
	total_corr_rerank.resize(4,0);
	total_pred_rerank.resize(4,0);

	BasicSpecReader bsr;
	QCPeak peaks[4000];

	int num_spec_tested=0;
	int num_top_denovo_correct=0;
	int num_top_rerank_correct=0;

	sum_stream << "Test file     : " << mgf_test_file << endl;
	sum_stream << "Num cases     : " << max_num_spectra << endl;
	sum_stream << "Num solutions : " << num_solutions << endl << endl;
	sum_stream << "START TESTING..." << endl;
	double start_t = time(NULL);

	double num_pred_aa=0;

	int spec_idx;
	for (spec_idx=0; spec_idx<all_ssf.size(); spec_idx++)
	{
	
		MGF_single *ssf = (MGF_single *)all_ssf[spec_idx];
		const string corr_pep_str = ssf->peptide.as_string(config);
		const Peptide& full_pep = ssf->peptide;
		const mass_t true_mass_with_19 = full_pep.get_mass_with_19();
		const int correct_pep_length = ssf->peptide.get_num_aas();
		BasicSpectrum bs;
		Spectrum s;

		// exclude pepitdes with M+16/Q-17
		const vector<int>& aas = ssf->peptide.get_amino_acids();
		int a;
		for (a=0; a<aas.size(); a++)
			if (aas[a]>Val)
				break;
		if (a<aas.size())
			continue;


		bs.ssf = all_ssf[spec_idx];
		bs.peaks = peaks;
		bs.num_peaks = bsr.read_basic_spec(config,fm,bs.ssf,bs.peaks);
		s.init_from_QCPeaks(config,bs.peaks,bs.num_peaks,ssf);


		// create the correct pm_graph, test for presence of optimal solutions
		opt_prm.clear();
		opt_prm.create_graph_from_spectrum(model,&s,s.get_true_mass_with_19(),s.get_charge());
		SeqPath longest = opt_prm.get_longest_subpath(ssf->peptide,0);

	//	if (longest.get_num_aa()<6)
	//		continue;

		bool has_complete_path = false;
		if (longest.get_num_aa() == correct_pep_length)
			has_complete_path = true;

		// generate de novo solutions
		vector<PmcSqsChargeRes> pmc_sqs_res;
		vector<mass_t> pms_with_19;
		vector<int>    charges;
		model->select_pms_and_charges(config,bs,pms_with_19,charges,true);
		

		if (prm_ptrs.size()<pms_with_19.size())
			prm_ptrs.resize(pms_with_19.size(),NULL);
		
		vector<SeqPath> solutions;
		generate_denovo_solutions_from_several_pms(
				prm_ptrs,
				model,
				&s,
				true, 
				num_solutions,
				6,
				14,
				pms_with_19,
				charges,
				solutions,
				false);

		vector<mass_t> exp_cut_masses;
		full_pep.calc_expected_breakage_masses(config, exp_cut_masses);


//		if (solutions.size()>0)
//			num_pred_aa += solutions[0].get_num_aa();

		int min_denovo_correct_rank=NEG_INF;
		int min_denovo_cut_correct_rank=POS_INF;

		int s_idx;
		bool has_correct_path_in_results=false;
		for (s_idx=0; s_idx<solutions.size(); s_idx++)
		{
			if (solutions[s_idx].check_if_correct(corr_pep_str,config))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -