⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovorankscore.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:

	vector<int> ppp_frag_type_idxs;
	ppp_frag_type_idxs.clear();
	ppp_frag_type_idxs.push_back(0);
	ppp_frag_type_idxs.push_back(1);
	ppp_frag_type_idxs.push_back(2);
	ppp_frag_type_idxs.push_back(3);

	DeNovoPartitionModel *part_model = dnv_part_models[charge][size_idx];
	part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
	

	// read sample peptide sequences
	create_complete_denovo_set_map(config,mgf_list,db_dir,correct_dir,
		denovo_dir,charge,size_idx,psm, file_indicators);
	

	mass_t min_mz=0;
	mass_t max_mz = 10000;
	if (size_idx>0)
		min_mz = (size_thresholds[charge][size_idx-1]+charge-1)/(mass_t)charge;
	if (size_idx< size_thresholds[charge].size())
		max_mz = (size_thresholds[charge][size_idx]+charge-1)/(mass_t)charge;

//	max_mz = min_mz + 30;
	
	cout << "Charge " << charge << " size " << size_idx << endl;
	cout << "Min m/z " << setprecision(2) << fixed << min_mz  << "  Max m/z " << max_mz << endl;

		// read spectra
	FileManager fm;
	FileSet	    fs;
	BasicSpecReader bsr;
	QCPeak		peaks[4000];

	fm.init_from_list_file(model->get_config(),mgf_list.c_str());
	fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
	
	static vector<PrmGraph *> prm_ptrs;
	static vector<SeqPath> solutions;
	const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
	int num_spectra_read=0;

	int num_sams=0;

	// Generate various types of samples from spectra
	int i;
	for (i=0; i<all_ssfs.size(); i++)
	{
		PeptideSetMap::const_iterator it;
		MGF_single *ssf = (MGF_single *)all_ssfs[i];
		scan_pair key(ssf->file_idx,ssf->idx_in_file);

		if (ssf->peptide.get_num_aas() != 12)
			continue;

		it = psm.find(key);
		if (it == psm.end())
			continue;

		const PeptideSet& set = (*it).second;
		BasicSpectrum     bs;
		AnnotatedSpectrum as;
		vector<PmcSqsChargeRes> pmc_sqs_res;

		int charge1=0,charge2=0;
		mass_t mz1=0,mz2=0;
		float prob1=0,prob2=0;

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;

		model->get_best_mz_charge(config,bs, &mz1, &charge1, &prob1,
								  &mz2, &charge2, &prob2, &pmc_sqs_res);

		const mass_t mass_with_19 = mz1*charge1 - (charge1-1.0);
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
		as.set_corrected_pm_with_19(mass_with_19);
	
	
	
	
		vector<int> idxs;
		idxs.push_back(part_model->ann_peak_start_idx+6);
		idxs.push_back(part_model->ann_peak_start_idx+7);

		RankBoostSample corr_rbs;
		fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
	
		cout << ++num_sams << 
				"\t" << part_model->feature_names[idxs[0]] << 
				"\t" << part_model->feature_names[idxs[1]] << endl;

		cout << set.correct_sol.type;
		int k;
		for (k=0; k<idxs.size(); k++)
		{
			float val=NEG_INF;
			corr_rbs.get_feature_val(idxs[k],&val);
			cout << "\t" << val;
		}
		cout << "\t" << set.correct_sol.pep.as_string(config) << endl;


		// add db samples
		int num_db=0;
		int num_dnv=0;
		int j;
		for (j=0; j<set.incorrect_sols.size(); j++)
		{
			
			if (set.incorrect_sols[j].type == 1)
			{ 
				num_db++;
				if (num_db>5)
					continue;
			}

			if (set.incorrect_sols[j].type != 1)
			{ 
				num_dnv++;
				if (num_dnv==1)
					cout << endl;
			}
			if (num_dnv>5)
				break;

			RankBoostSample bad_rbs;
			fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res, 
				bad_rbs, size_idx);

			cout << set.incorrect_sols[j].type;
			int k;
			for (k=0; k<idxs.size(); k++)
			{
				float val=NEG_INF;
				bad_rbs.get_feature_val(idxs[k],&val);
				cout << "\t" << val + 0.05 * (set.incorrect_sols[j].type == 1 ? num_db : num_dnv);
			}
			cout << "\t" << set.incorrect_sols[j].pep.as_string(config) << endl;

			
		}

		if (num_sams==50)
			break;
	
	}
}



void create_bench_mgf(Config *config, 
					  char *file_list, 
					  char *out_name, 
					  int num_spectra,
					  bool use_exact_pm)
{	
	mass_t min_mz=0;
	mass_t max_mz = 1300;
	int charge =2;

	ofstream ofs(out_name);

	// read spectra
	FileManager fm;
	FileSet	    fs;
	BasicSpecReader bsr;
	QCPeak		peaks[4000];

	fm.init_from_list_file(config,file_list);
	fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
	fs.randomly_reduce_ssfs(int(num_spectra*1.07));

	const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
	int spec_idx;
	int num_written=0;
	vector<int> length_counts;
	length_counts.resize(100,0);
	for (spec_idx=0; spec_idx<all_ssfs.size() && num_written<num_spectra; spec_idx++)
	{
		MGF_single *ssf = (MGF_single *)all_ssfs[spec_idx];
		BasicSpectrum     bs;
		AnnotatedSpectrum as;
		PrmGraph prm;

		const vector<int>& aas = ssf->peptide.get_amino_acids();
		int a;
		for (a=0; a<aas.size(); a++)
			if (aas[a]>Val)
				break;
		if (a<aas.size())
			continue;

		const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks,false,true);
		bs.peaks = peaks;
		bs.num_peaks = num_peaks;
		bs.ssf = ssf;
		as.init_from_QCPeaks(config,peaks,num_peaks,ssf,false);

		ostringstream oss;
		oss << "spec_" << num_written;
		as.set_file_name(oss.str());

		if (use_exact_pm)
		{
			as.set_corrected_pm_with_19(as.get_true_mass_with_19());
			as.set_org_pm_with_19(as.get_true_mass_with_19());
			as.set_m_over_z((as.get_true_mass_with_19()+MASS_PROTON*(as.get_charge()-1))/as.get_charge());
		}

		as.output_as_MGF(ofs);

	//	cout << as.get_org_pm_with_19() << "\t" << as.get_peptide().get_mass_with_19() << "\t" <<
	//		as.get_peptide().as_string(config) << endl;

		num_written++;

		length_counts[as.get_peptide().get_num_aas()]++;
	}

	ofs.close();

	cout << "Wrote " << num_written << " to " << out_name << endl;
	cout << "Histogram: " << endl;
	int i;
	for (i=0; i<length_counts.size(); i++)
	{
		if (length_counts[i]>0)
		{
			cout << i << "\t" << length_counts[i] << "\t" << setprecision(1) << fixed << 100.0*length_counts[i]/(float)num_written << endl;
		}
	}
}


void run_peak_benchmark(AdvancedScoreModel *model, char *benchmark_file)
{
	const int max_rank = 7;
	const int num_ranks_to_consider = 50;

	Config *config = model->get_config();
	DeNovoRankScorer *dnv_rank = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
	PeakRankModel *prm = dnv_rank->get_peak_prediction_model(3);
	FileManager fm;
	FileSet		fs;
	int num_exact_first=0;
	vector<int> correct_rank_counts;
	correct_rank_counts.resize(10,0);

	fm.init_from_file(config,benchmark_file);
	fs.select_all_files(fm);
	fs.randomly_reduce_ssfs(2000);

	
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	vector<QCPeak> peaks;
	peaks.resize(5000);

	BasicSpecReader bsr;
	int i;
	for (i=0; i<all_ssf.size(); i++)
	{
		vector< vector<intensity_t> > ann_intens;
		vector< vector<mass_t> >	  ann_masses;
		AnnotatedSpectrum as;
		Peptide pep = all_ssf[i]->peptide;
		PeptideSolution sol;
		sol.pep = pep;
		sol.reaches_n_terminal=true;
		sol.reaches_c_terminal=true;
		sol.charge = all_ssf[i]->charge;
		sol.pm_with_19 = pep.get_mass_with_19();

		const int num_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],&peaks[0]);
		as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssf[i]);
		as.set_peptide(sol.pep);
		as.annotate_spectrum(sol.pm_with_19, true);
		as.extract_annotated_intens_and_masses(ann_intens,ann_masses);

		PeptidePeakPrediction ppp;
		prm->calc_peptide_predicted_scores(sol, ppp);


		// reduce intensities to the same dimensionality
		const int num_frags = ppp.frag_idxs.size();
		vector< vector< float> > observed_intens;
		observed_intens.resize(num_frags);

		int f;
		for (f=0; f<num_frags; f++)
		{
			const int frag_idx = ppp.frag_idxs[f];
			observed_intens[f]=ann_intens[frag_idx]; 
		}

		// calculate the ranks and mapping between predicted and observed
		vector< vector<int> > observed_ranks, predicted_ranks;
		calc_combined_peak_ranks(observed_intens, observed_ranks);
		calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);

		vector<int> pred2obs, obs2pred;
		vector<int> num_obs_for_frag, num_pred_for_frag;
		vector<float> ordered_scores,  // scores sorted according to their value
					  obs_ordered_scores;  // scores sorted according to the observed intensity rank
		pred2obs.resize(num_ranks_to_consider,999); // look at top 50 peaks
		obs2pred.resize(num_ranks_to_consider,999);
		ordered_scores.resize(num_ranks_to_consider,NEG_INF);
		obs_ordered_scores.resize(num_ranks_to_consider,NEG_INF);
		num_obs_for_frag.resize(num_frags,0);
		num_pred_for_frag.resize(num_frags,0);


		for (f=0; f<num_frags; f++)
		{
			if (observed_ranks[f].size() != predicted_ranks[f].size())
			{
				cout << "#obs  frags: " << observed_ranks.size() << endl;
				cout << "#pred frags: " << predicted_ranks.size() << endl;
				cout << "Error: mismatch in rank dimensionalities!" << endl;
				cout << f << "\tobs : " << observed_ranks[f].size() << "   pred " << predicted_ranks[f].size() << endl;
				exit(1);
			}
			const int num_ranks = predicted_ranks[f].size();
			const vector<float>& frag_rank_scores = ppp.rank_scores[f];
			const vector<float>& frag_intens = observed_intens[f];
			int j;
			for (j=0; j<num_ranks; j++)
			{
				const int obs_rank  = observed_ranks[f][j];
				const int pred_rank = predicted_ranks[f][j];
				const float pred_score = frag_rank_scores[j];

				if (pred_rank<num_ranks_to_consider)
				{
					pred2obs[pred_rank]=obs_rank;
					ordered_scores[pred_rank]=pred_score;
				}

				if (obs_rank<num_ranks_to_consider)
				{
					obs2pred[obs_rank]=pred_rank;
					obs_ordered_scores[obs_rank]=pred_score;
				}

				if (frag_intens[j]>0)
					num_obs_for_frag[f]++;

				if (frag_rank_scores[j]>NEG_INF)
					num_pred_for_frag[f]++;
			}
		}


		int j;
	/*	cout << i << ":\t";
		for (j=0; j<7; j++)
			cout << obs2pred[j] << "\t";
		cout << endl;*/

		if (obs2pred[0] == 0)
		{
			num_exact_first++;
		}

		
		for (j=0; j<max_rank; j++)
		{
			int min=j-3;
			int max=j+3;
		//	if (min<0)
		//		max-=min;

			if (pred2obs[j]>=min && pred2obs[j]<=max)
				correct_rank_counts[j]++;
				
		}
	}

	cout << i;
	cout << setprecision(3) << fixed;
//	cout << num_exact_first/(float)all_ssf.size() << "\t";
	int j;
	for (j=0; j<max_rank; j++)
	{
		cout << " & " << (correct_rank_counts[j]/(float)all_ssf.size());
	}
	cout << "\\\\" << endl;
}


void make_peak_hist_for_obs_rank(AdvancedScoreModel *model, 
								 char *benchmark_file, 
								 int obs_rank)
{
	const int num_ranks_to_consider = 50;
	const int max_hist_rank = 20;

	Config *config = model->get_config();
	DeNovoRankScorer *dnv_rank = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
	PeakRankModel *prm = dnv_rank->get_peak_prediction_model(3);
	FileManager fm;
	FileSet		fs;
	int num_exact_first=0;
	vector<int> rank_hist;
	rank_hist.resize(max_hist_rank+1,0);

	fm.init_from_file(config,benchmark_file);
	fs.select_all_files(fm);
	fs.randomly_reduce_ssfs(10000);

	
	const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
	vector<QCPeak> peaks;
	peaks.resize(5000);

	BasicSpecReader bsr;
	int count =0;
	int i;
	for (i=0; i<all_ssf.size(); i++)
	{
		vector< vector<intensity_t> > ann_intens;
		vector< vector<mass_t> >	  ann_masses;
		AnnotatedSpectrum as;
		Peptide pep = all_ssf[i]->peptide;
		PeptideSolution sol;
		sol.pep = pep;
		sol.reaches_n_terminal=true;
		sol.reaches_c_terminal=true;
		sol.charge = all_ssf[i]->charge;
		sol.pm_with_19 = pep.get_mass_with_19();

		const int num_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],&peaks[0]);
		as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssf[i]);
		as.set_peptide(sol.pep);
		as.annotate_spectrum(sol.pm_with_19, true);
		as.extract_annotated_intens_and_masses(ann_intens,ann_masses);

		PeptidePeakPrediction ppp;
		prm->calc_peptide_predicted_scores(sol, p

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -