⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pmc_rank.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 4 页
字号:
			sam.add_real_feature(r_idx++,stats.mean_offset_h2o_pairs/(1.0+stats.num_h2o_loss_frag_pairs));
		}
		else
			r_idx+=2;

		if (stats.mean_offset_c2_h2o_pairs<POS_INF)
		{
			sam.add_real_feature(r_idx++,stats.mean_offset_c2_h2o_pairs);
			sam.add_real_feature(r_idx++,stats.mean_offset_c2_h2o_pairs/(1.0+stats.num_h2o_loss_c2_frag_pairs));
		}
		else
			r_idx+=2;

		// individual offsets
		for (j=0; j<5 && j<stats.offset_pairs_ordered_by_inten.size(); j++)
			sam.add_real_feature(r_idx+j,stats.offset_pairs_ordered_by_inten[j]);
		r_idx+=5;

		for (j=0; j<5 && j<stats.c2_offset_pairs_ordered_by_inten.size(); j++)
			sam.add_real_feature(r_idx+j,stats.c2_offset_pairs_ordered_by_inten[j]);
		r_idx+=5;

	
		// add the +0 +1 +2 strict counts
		sam.add_real_feature(r_idx++,stats.num_strict_pairs0);
		sam.add_real_feature(r_idx++,stats.inten_strict_pairs0 * inten_norm);
	
		sam.add_real_feature(r_idx++,stats.num_strict_pairs1);
		sam.add_real_feature(r_idx++,stats.inten_strict_pairs1 * inten_norm);
	
		sam.add_real_feature(r_idx++,stats.num_strict_pairs2);
		sam.add_real_feature(r_idx++,stats.inten_strict_pairs2 * inten_norm);
	
		sam.add_real_feature(r_idx++,stats.c2_num_strict_pairs0);
		sam.add_real_feature(r_idx++,stats.c2_inten_strict_pairs0 * inten_norm);
	
		sam.add_real_feature(r_idx++,stats.c2_num_strict_pairs1);
		sam.add_real_feature(r_idx++,stats.c2_inten_strict_pairs1 * inten_norm);
	
		sam.add_real_feature(r_idx++,stats.c2_num_strict_pairs2);
		sam.add_real_feature(r_idx++,stats.c2_inten_strict_pairs2 * inten_norm);

		// add comparative features to -2 -1 +1 +2 Da away
		for (j=0; j<idx_offsets.size(); j++)
		{
			const int other_idx = i + idx_offsets[j];
			if (other_idx<0 || other_idx>= samples.size())
			{
				r_idx+=12;
				continue;
			}

			const PMCRankStats& other = curr_spec_rank_pmc_tables[charge][other_idx];

			sam.add_real_feature(r_idx++,stats.num_frag_pairs - other.num_frag_pairs);
			sam.add_real_feature(r_idx++,stats.num_strong_frag_pairs - other.num_strong_frag_pairs);
			sam.add_real_feature(r_idx++,stats.num_c2_frag_pairs - other.num_c2_frag_pairs);
			sam.add_real_feature(r_idx++,stats.num_strong_c2_frag_pairs - other.num_strong_c2_frag_pairs);
			sam.add_real_feature(r_idx++,stats.num_h2o_loss_frag_pairs - other.num_h2o_loss_frag_pairs);
			sam.add_real_feature(r_idx++,stats.num_h2o_loss_c2_frag_pairs - other.num_h2o_loss_c2_frag_pairs);

			sam.add_real_feature(r_idx++,(stats.inten_frag_pairs - other.inten_frag_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(stats.inten_strong_pairs - other.inten_strong_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(stats.inten_c2_pairs - other.inten_c2_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(stats.inten_c2_strong_pairs - other.inten_c2_strong_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(stats.inten_h2o_loss_frag_pairs - other.inten_h2o_loss_frag_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(stats.itnen_h2o_loss_c2_frag_pairs - other.itnen_h2o_loss_c2_frag_pairs) * inten_norm);
		}

		const int plus_idx = i + idx_skip;
		const int minus_idx = i-idx_skip;

		if (plus_idx<samples.size() && minus_idx>0)
		{
			const PMCRankStats& plus = curr_spec_rank_pmc_tables[charge][plus_idx];
			const PMCRankStats& minus = curr_spec_rank_pmc_tables[charge][minus_idx];

			sam.add_real_feature(r_idx++,plus.num_frag_pairs - minus.num_frag_pairs);
			sam.add_real_feature(r_idx++,plus.num_strong_frag_pairs - minus.num_strong_frag_pairs);
			sam.add_real_feature(r_idx++,plus.num_c2_frag_pairs - minus.num_c2_frag_pairs);
			sam.add_real_feature(r_idx++,plus.num_strong_c2_frag_pairs - minus.num_strong_c2_frag_pairs);
			sam.add_real_feature(r_idx++,plus.num_h2o_loss_frag_pairs - minus.num_h2o_loss_frag_pairs);
			sam.add_real_feature(r_idx++,plus.num_h2o_loss_c2_frag_pairs - minus.num_h2o_loss_c2_frag_pairs);

			sam.add_real_feature(r_idx++,(plus.inten_frag_pairs - minus.inten_frag_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(plus.inten_strong_pairs - minus.inten_strong_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(plus.inten_c2_pairs - minus.inten_c2_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(plus.inten_c2_strong_pairs - minus.inten_c2_strong_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(plus.inten_h2o_loss_frag_pairs - minus.inten_h2o_loss_frag_pairs) * inten_norm);
			sam.add_real_feature(r_idx++,(plus.itnen_h2o_loss_c2_frag_pairs - minus.itnen_h2o_loss_c2_frag_pairs) * inten_norm);
		}
	}
}


void init_PMC_feature_names(vector<string>& names)
{
	names.clear();
	int i;

	names.push_back("OFFSET FROM MEASURED M/Z");

	names.push_back("OFFSET FROM MEASURED M/Z, NUM PAIRS <=2");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM PAIRS <=4");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM PAIRS >4");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM STRONG PAIRS <3");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM STRONG PAIRS >=3");

	names.push_back("OFFSET FROM MEASURED M/Z, NUM C2 PAIRS <=2");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM C2 PAIRS <=4");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM C2 PAIRS >4");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM STRONG C2 PAIRS <3");
	names.push_back("OFFSET FROM MEASURED M/Z, NUM STRONG C2 PAIRS >=3");
	
	names.push_back("# PAIRS");
	names.push_back("# STRONG PAIRS");
	names.push_back("# C2 PAIRS");
	names.push_back("# STRONG C2 PAIRS");
	names.push_back("# H2O PAIRS");
	names.push_back("# C2 H2O PAIRS");

	names.push_back("INTEN PAIRS");
	names.push_back("INTEN STRONG PAIRS");
	names.push_back("INTEN C2 PAIRS");
	names.push_back("INTEN STRONG C2 PAIRS");
	names.push_back("INTEN H2O PAIRS");
	names.push_back("INTEN C2 H2O PAIRS");

	for (i=2; i<7; i++)
	{
		char name[64];
		sprintf(name,"AVG OFFSET TOP (STRONG %d)",i);
		names.push_back(name);
	}

	for (i=2; i<7; i++)
	{
		char name[64];
		sprintf(name,"AVG OFFSET TOP C2 (STRONG %d)",i);
		names.push_back(name);
	}

	names.push_back("MEAN OFFSET PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET PAIRS");

	names.push_back("MEAN OFFSET STRONG PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET STRONG PAIRS");

	names.push_back("MEAN OFFSET C2 PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET C2 PAIRS");

	names.push_back("MEAN OFFSET STRONG C2 PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET STRONG C2 PAIRS");

	names.push_back("MEAN OFFSET H2O PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET H2O PAIRS");

	names.push_back("MEAN OFFSET C2 H2O PAIRS");
	names.push_back("WEIGHTED MEAN OFFSET C2 H2O PAIRS");

	for (i=0; i<5; i++)
	{
		char name[64];
		sprintf(name,"PAIR OFFSET (STRONG %d)",i+1);
		names.push_back(name);
	}

	for (i=0; i<5; i++)
	{
		char name[64];
		sprintf(name,"C2 PAIR OFFSET (STRONG %d)",i+1);
		names.push_back(name);
	}



	names.push_back("NUM STRICT 0");
	names.push_back("INTEN STRICT 0");

	names.push_back("NUM STRICT 1");
	names.push_back("INTEN STRICT 1");

	names.push_back("NUM STRICT 2");
	names.push_back("INTEN STRICT 2");

	names.push_back("NUM C2 STRICT 0");
	names.push_back("INTEN C2 STRICT 0");

	names.push_back("NUM C2 STRICT 1");
	names.push_back("INTEN C2 STRICT 1");

	names.push_back("NUM C2 STRICT 2");
	names.push_back("INTEN C2 STRICT 2");

	// diff features with -2 -1 +1 +2
	const string dis_labels[]={"-2","-1","+1","+2"};
	for (i=0; i<4; i++)
	{
		const string prefix = "DIFF WITH "+dis_labels[i]+" ";

		names.push_back(prefix+"# PAIRS");
		names.push_back(prefix+"# STRONG PAIRS");
		names.push_back(prefix+"# C2 PAIRS");
		names.push_back(prefix+"# STRONG C2 PAIRS");
		names.push_back(prefix+"# H2O PAIRS");
		names.push_back(prefix+"# C2 H2O PAIRS");

		names.push_back(prefix+"INTEN PAIRS");
		names.push_back(prefix+"INTEN STRONG PAIRS");
		names.push_back(prefix+"INTEN C2 PAIRS");
		names.push_back(prefix+"INTEN STRONG C2 PAIRS");
		names.push_back(prefix+"INTEN H2O PAIRS");
		names.push_back(prefix+"INTEN C2 H2O PAIRS");
	}

	names.push_back("DIFF +1/-1 # PAIRS");
	names.push_back("DIFF +1/-1 # STRONG PAIRS");
	names.push_back("DIFF +1/-1 # C2 PAIRS");
	names.push_back("DIFF +1/-1 # STRONG C2 PAIRS");
	names.push_back("DIFF +1/-1 # H2O PAIRS");
	names.push_back("DIFF +1/-1 # C2 H2O PAIRS");

	names.push_back("DIFF +1/-1 INTEN PAIRS");
	names.push_back("DIFF +1/-1 INTEN STRONG PAIRS");
	names.push_back("DIFF +1/-1 INTEN C2 PAIRS");
	names.push_back("DIFF +1/-1 INTEN STRONG C2 PAIRS");
	names.push_back("DIFF +1/-1 INTEN H2O PAIRS");
	names.push_back("DIFF +1/-1 INTEN C2 H2O PAIRS");
	cout << "Initialized: " << names.size() << " real feature names..." << endl;
}




void PMCSQS_Scorer::output_pmc_rank_results(const FileManager& fm,
											int charge,
											const vector<SingleSpectrumFile *>& test_ssfs) 
{
	BasicSpecReader bsr;
	static QCPeak peaks[5000];

	vector<int> org_offset_counts, new_offset_counts;
	org_offset_counts.resize(201,0);
	new_offset_counts.resize(201,0);

	vector<mass_t> org_offsets;
	vector<mass_t> corr_offsets;

	org_offsets.clear();
	corr_offsets.clear();

	int i;
	for (i=0; i<test_ssfs.size(); i++)
	{
		SingleSpectrumFile* ssf = test_ssfs[i];
		BasicSpectrum bs;
	
		bs.num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
		bs.peaks = peaks;
		bs.ssf = ssf;

		init_for_current_spec(config,bs);
		calculate_curr_spec_pmc_values(bs, bin_increment);

		PmcSqsChargeRes res;
		find_best_mz_values_from_rank_model(bs, charge, res);

		ssf->peptide.calc_mass(config);
		mass_t true_mz = (ssf->peptide.get_mass() + 18.01 + charge)/charge;

		org_offsets.push_back(true_mz - ssf->m_over_z);
		corr_offsets.push_back(true_mz - res.mz1);
	}

	mass_t m_org,sd_org,m_corr,sd_corr;
	calc_mean_sd(org_offsets,&m_org, &sd_org);
	calc_mean_sd(corr_offsets,&m_corr,&sd_corr);

	cout << "CHARGE: " << charge << endl;
	cout << "ORG:  mean " << m_org << " " << sd_org << endl;
	cout << "CORR: mean " << m_corr << " " << sd_corr << endl;

	for (i=0; i<org_offsets.size(); i++)
	{
		int org_idx = 100 + int(org_offsets[i] * 20);
		if (org_idx<0)
			org_idx = 0;
		if (org_idx>200)
			org_idx=200;
		org_offset_counts[org_idx]++;

		int new_idx = 100 + int(corr_offsets[i] * 20);
		if (new_idx<0)
			new_idx = 0;
		if (new_idx>200)
			new_idx=200;
		new_offset_counts[new_idx]++;
	}

	int cum_org=0;
	int cum_new=0;
	for (i=0; i<=200; i++)
	{

		if (org_offset_counts[i]==0 && new_offset_counts[i]==0)
			continue;
		
		cum_org+=org_offset_counts[i];
		cum_new+=new_offset_counts[i];
		cout << fixed << setprecision(3) << i*0.05 - 5.0 << "\t" <<
			org_offset_counts[i]/(float)org_offsets.size() << "\t" <<
			new_offset_counts[i]/(float)corr_offsets.size() << "\t" <<
			cum_org/(float)org_offsets.size() << "\t"<<
			cum_new/(float)corr_offsets.size() << endl;

	}


}


void PMCSQS_Scorer::find_best_mz_values_from_rank_model(const BasicSpectrum& bs, 
										int charge, PmcSqsChargeRes& res)
{
	static vector<RankBoostSample> spec_samples;
	static vector<float> rank_scores;

	fill_RankBoost_smaples_with_PMC(bs, charge, spec_samples);

	if (rank_scores.size() != spec_samples.size())
		rank_scores.resize(spec_samples.size(),NEG_INF);
	
	int best_idx=-1;
	float best_score=NEG_INF;
	int i;
	for (i=0; i<spec_samples.size(); i++)
	{
		const mass_t pm_with_19 = bs.ssf->m_over_z * charge - (charge + 1);
		const int size_idx = get_rank_model_size_idx(charge, pm_with_19);

		if (charge>= pmc_rank_models.size() ||
			size_idx>= pmc_rank_models[charge].size() ||
			! pmc_rank_models[charge][size_idx])
			continue;


		rank_scores[i]=pmc_rank_models[charge][size_idx]->calc_rank_score(spec_samples[i]);
		curr_spec_rank_pmc_tables[charge][i].rank_score = rank_scores[i];
		if (rank_scores[i]>best_score)
		{
			best_score=rank_scores[i];
			best_idx = i;
		}
	}

	// no suitable models were found for this spectrum
	if (best_idx<0)
	{
		res.mz1 = bs.ssf->m_over_z;
		res.score1 = 10.0;
		res.mz2 = NEG_INF;
		res.score2 = NEG_INF;
		return;
	}

	
	res.mz1 = curr_spec_rank_pmc_tables[charge][best_idx].m_over_z;
	res.score1 = best_score;

	// look for additional m/z
	int second_best_idx=-1;
	float second_best_score=NEG_INF;

	const mass_t mz_diff = curr_spec_rank_pmc_tables[charge][1].m_over_z - 
						   curr_spec_rank_pmc_tables[charge][0].m_over_z;

	const int idx_diff = (int)(0.45/(charge * mz_diff));
	for (i=0; i<spec_samples.size(); i++)
	{
		if (fabs(i-best_idx)<idx_diff)
			continue;

		if (rank_scores[i]>second_best_score)
		{
			second_best_score=rank_scores[i];
			second_best_idx = i;
		}

	}
 
	if (second_best_idx>=0)
	{
		res.mz2 = curr_spec_rank_pmc_tables[charge][second_best_idx].m_over_z;
		res.score2 = second_best_score;
	} 
	else
	{
		res.mz2 = NEG_INF;
		res.score2 = NEG_INF;
	}

//	const int size_idx = this->get_rank_model_size_idx(charge,res.mz1*charge-charge+1);
//	res.mz1 -= pmc_charge_mz_biases[charge][size_idx];
//	res.mz2 -= pmc_charge_mz_biases[charge][size_idx];

//	cout << charge << " ]\t" << res.mz1 << "\t" << res.prob1 << "\t" << res.mz2 << "\t" << res.prob2 << endl;


}




⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -