⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 denovopartmodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
	if (oris.size()>0)
	{
		const float one_over = 1.0/(float)oris.size();
		rbs.add_real_feature(f_idx++,(float)num_with1*one_over);
		rbs.add_real_feature(f_idx++,(float)num_with2*one_over);
		rbs.add_real_feature(f_idx++,(float)num_with_alot*one_over);
		rbs.add_real_feature(f_idx++,num_dual_ori*one_over);
	}
	else
		f_idx+=4;
	rbs.add_real_feature(f_idx++,(float)switches);

/*	feature_names.push_back("PRM #breakages with 1 frag detected");
	feature_names.push_back("PRM #breakages with 2 frag detected");
	feature_names.push_back("PRM #breakages with > 5 frags detected");

	feature_names.push_back("PRM #breakages with dual orientation frags");
	feature_names.push_back("PRM #orientation switches");*/
}


void DeNovoPartitionModel::fill_pmcsqs_features(
		const PeptideSolution& sol,
		const vector<PmcSqsChargeRes>& res,
		const PMCSQS_Scorer *pmc_model,
		RankBoostSample& rbs) const
{
	int f_idx = pmc_start_idx;

	const mass_t& mz1 = res[sol.charge].mz1;
	const mass_t& mz2 = res[sol.charge].mz2;
	const mass_t pm1_with_19 = (mz1>0 ? mz1 * sol.charge - (sol.charge-1)*MASS_PROTON : NEG_INF);
	const mass_t pm2_with_19 = (mz2>0 ? mz2 * sol.charge - (sol.charge-1)*MASS_PROTON : NEG_INF); 
	const float prob_charge = res[sol.charge].min_comp_prob;
	
	if (mz1>0)
	{
		rbs.add_real_feature(f_idx,res[sol.charge].sqs_prob);
		rbs.add_real_feature(f_idx+1,prob_charge);

		if (prob_charge>0.95)
		{
			rbs.add_real_feature(f_idx+2,pm1_with_19-sol.pep.get_mass_with_19());
		}
		else
			rbs.add_real_feature(f_idx+3,pm1_with_19-sol.pep.get_mass_with_19());

		rbs.add_real_feature(f_idx+4,res[sol.charge].score1);
	}
	
	f_idx+=5;

	if (mz2>0)
	{
		rbs.add_real_feature(f_idx++,res[sol.charge].score2);
		rbs.add_real_feature(f_idx++,pm2_with_19-sol.pep.get_mass_with_19());
	}
	else
		f_idx+=2;

	float max_prob=-1;
	int c;
	for (c=1; c<res.size(); c++)
	{
		if (c==sol.charge)
			continue;
		if (res[c].min_comp_prob>max_prob)
			max_prob=res[c].min_comp_prob;
	}

	if (max_prob>=0)
		rbs.add_real_feature(f_idx,max_prob);
	f_idx++;

	const vector< vector< PMCRankStats > >& curr_stats = pmc_model->get_curr_spec_rank_pmc_tables();
	const vector< PMCRankStats >& charge_stats = curr_stats[sol.charge];

	int i;
	int mz_idx =0;
	while (mz_idx<charge_stats.size() && charge_stats[mz_idx].m_over_z>mz1)
		mz_idx++;

	if (mz_idx>0 && mz_idx<charge_stats.size() && 
		charge_stats[mz_idx].m_over_z-mz1>mz1-charge_stats[mz_idx-1].m_over_z)
		mz_idx--;


	int best_idx=0;
	for (i=0; i<charge_stats.size(); i++)
		if (charge_stats[i].rank_score>charge_stats[best_idx].rank_score)
			best_idx = i;
	
	const float score_diff = charge_stats[best_idx].rank_score - charge_stats[mz_idx].rank_score;
	if (prob_charge>0.95)
	{
		rbs.add_real_feature(f_idx,score_diff);
	}
	else if (prob_charge>0.7)
	{
		rbs.add_real_feature(f_idx+1,score_diff);
	}
	else
	{
		rbs.add_real_feature(f_idx+2,score_diff);
	}
	f_idx+=3;	
}

void DeNovoPartitionModel::fill_composition_features(
							const PeptideSolution& sol,
							Config *config,
							PeptideCompAssigner *comp_assigner,
							const SeqPath& path,
							RankBoostSample& rbs) const
{
	PeptideCompStats comp_stats;

	comp_assigner->fill_peptide_stats(sol.pep,comp_stats);

	const int num_aas = sol.pep.get_num_aas();
	const vector<int> sol_aas = sol.pep.get_amino_acids();

	int f_idx = comp_start_idx;

	if (sol.reaches_n_terminal)
		rbs.add_real_feature(f_idx,(float)comp_stats.start_comp[3]);
	f_idx++;

	if (sol.reaches_c_terminal)
		rbs.add_real_feature(f_idx,(float)comp_stats.end_comp[3]);
	f_idx++;

	const int *len3_counts = comp_stats.cat_counts[3];

	if (1)
	{
		rbs.add_real_feature(f_idx++,(float)(len3_counts[19]+len3_counts[20]));
		rbs.add_real_feature(f_idx++,(float)(len3_counts[15]+len3_counts[16]+len3_counts[17]+len3_counts[18]));
		rbs.add_real_feature(f_idx++,(float)(len3_counts[7]+len3_counts[8]+len3_counts[9]+len3_counts[10]+
											 len3_counts[11]+len3_counts[12]+len3_counts[13]+len3_counts[14]));
		rbs.add_real_feature(f_idx++,(float)(len3_counts[3]+len3_counts[4]+len3_counts[5]+len3_counts[6]));
		rbs.add_real_feature(f_idx++,(float)(len3_counts[1]+len3_counts[2]));
	}
	else
		f_idx+=5;
	
	int avg = 0;
	int cat;
	for (cat=1; cat<=MAX_COMP_CAT; cat++)
	{
		avg+=cat*len3_counts[cat];
		if (len3_counts[cat]>0)
			break;
	}
	int min_cat = cat++;
	for ( ; cat<=MAX_COMP_CAT; cat++)
		avg+=cat*len3_counts[cat];

	if (1)
		rbs.add_real_feature(f_idx,min_cat);
	f_idx++;

	if (num_aas>0)
		rbs.add_real_feature(f_idx,avg/(float)num_aas);
	f_idx++;
	

	vector<score_pair> pairs;
	if (path.positions.size()>3)
	{
		const int start_idx = 1;
		const int end_idx   = path.positions.size()-2;

		int i;
		for (i=start_idx; i<end_idx; i++)
		{
			const PathPos& pos = path.positions[i];
			if (pos.node_idx>0)
			{
				score_pair p;
				p.idx = i;
				p.score = pos.node_score;
				pairs.push_back(p);
			}
		}
		sort(pairs.begin(),pairs.end());

		for (i=0; i<pairs.size() && i<4; i++)
		{
			const int idx = pairs[i].idx;
			int prev_idx;
			for (prev_idx = idx-1; prev_idx>=0; prev_idx--)
				if (path.positions[prev_idx].edge_idx>=0)
					break;

			int prev_cat = -1;
			if (prev_idx>=0 && idx- prev_idx<=3)
			{
				prev_cat = comp_assigner->get_aa_category(idx-prev_idx,&sol_aas[prev_idx],
					(prev_idx == 0 && sol.reaches_n_terminal), false);
			}

			int next_cat = -1;
			if (path.positions[idx].edge_idx>=0)
			{
				int next_idx;
				for (next_idx = idx+1; next_idx<=num_aas; next_idx++)
					if (path.positions[next_idx].node_idx>0)
						break;
				if (next_idx<=num_aas && next_idx-idx<=3)
				{
					next_cat = comp_assigner->get_aa_category(next_idx-idx,&sol_aas[idx],
					false, (next_idx == num_aas && sol.reaches_c_terminal));
				}
			}

			int span_aas[2]={path.positions[idx-1].aa,path.positions[idx].aa};
			int span_cat = comp_assigner->get_aa_category(2,span_aas, false, false);

			rbs.add_real_feature(f_idx+i*3,(float)prev_cat);
			rbs.add_real_feature(f_idx+i*3+1,(float)next_cat);
			rbs.add_real_feature(f_idx+i*3+2,(float)span_cat);
		}
	}
	f_idx+=12;

	const vector<int>& org_aas = config->get_org_aa();
	vector<int> aa_counts;
	int max_val=2;
	if (sol_aas.size()>14)
		max_val=3;
	if (sol_aas.size()>22)
		max_val=4;
	aa_counts.resize(Val+1,0);
	int i;
	for (i=0; i<sol_aas.size(); i++)
		aa_counts[org_aas[sol_aas[i]]]++;
	aa_counts[Leu]+=aa_counts[Ile];

	int c=0;
	for (i=Ala; i<=Val; i++)
	{
		if (i==Ile)
			continue;
		
		if (aa_counts[i]>0)
		{
			rbs.add_real_feature(f_idx+c,(aa_counts[i]>max_val ? max_val : aa_counts[i]));
		}
		c++;
	}
	f_idx+=c;

	const int max_idx = sol_aas.size()-1;
	int num_W=0;
	int num_Q=0;
	int num_N=0;
	int num_XG=0;
	for (i=0; i<max_idx; i++)
	{
		if (path.positions[i].edge_idx>=0)
			continue;

		const int aa1=sol_aas[i];
		const int aa2=sol_aas[i+1];

		if (aa2 == Gly && (aa1 == Glu || aa1 == Gly || aa1 == Ala))
			num_XG++;

		if (aa1 == Gly && aa2 == Gly)
		{
			num_N++;
			continue;
		}

		if ((aa1 == Ala && aa2 == Gly) ||
			(aa1 == Gly && aa2 == Ala))
		{
			num_Q++;
			continue;
		}

		if ((aa1 == Glu && aa2 == Gly) ||
			(aa1 == Gly && aa2 == Glu) ||
			(aa1 == Ala && aa2 == Asp) ||
			(aa1 == Asp && aa2 == Ala) ||
			(aa1 == Val && aa2 == Ser) ||
			(aa1 == Ser && aa2 == Val))
		{
			num_W++;
			continue;
		}
	}

	const int num_problematic = (num_W+num_Q+num_N);
	if (num_problematic>0)
		rbs.add_real_feature(f_idx,(float)num_problematic);
	f_idx++;

	if (num_W>0)
		rbs.add_real_feature(f_idx,(float)num_W);
	f_idx++;
	if (num_Q>0)
		rbs.add_real_feature(f_idx,(float)num_Q);
	f_idx++;
	if (num_N>0)
		rbs.add_real_feature(f_idx,(float)num_N);
	f_idx++;
	if (num_XG>0)
		rbs.add_real_feature(f_idx,(float)num_XG);
	f_idx++;

/*	feature_names.push_back("PEP COMP #double EG,GE,AD,DA,VS,SV");
	feature_names.push_back("PEP COMP #double ");
	feature_names.push_back("PEP COMP #double GG");
	feature_names.push_back("PEP COMP #double GA");
	feature_names.push_back("PEP COMP #double AG");
	feature_names.push_back("PEP COMP #double SL");*/

}

void DeNovoPartitionModel::fill_peak_offset_features(
								   Config *config,
								   const PeptideSolution& sol,
								   const vector< vector<mass_t> >& masses,
								   const vector< vector<intensity_t> >& intens,
								   RankBoostSample& rbs) const
{
	int f_idx = peak_offset_start_idx;

	vector<mass_t> break_masses;
	sol.pep.calc_expected_breakage_masses(config,break_masses);
	const int max_break = break_masses.size()-1;
	const mass_t pm_with_19 = sol.pm_with_19;

	int f;
	for (f=0; f<ppp_frag_type_idxs.size() && f<2; f++)
	{
		const int frag_idx = ppp_frag_type_idxs[f];
		const vector<intensity_t>& frag_intens = intens[frag_idx];
		const vector<mass_t>&	   frag_masses = masses[frag_idx];
		const FragmentType& fragment = config->get_fragment(frag_idx);
		vector<mass_t> exp_peak_masses;
		vector<int>	   inten_ranks, rank_positions;

		exp_peak_masses.resize(break_masses.size(),NEG_INF);
		find_inten_ranks(frag_intens, inten_ranks);
		rank_positions.resize(inten_ranks.size(),NEG_INF);
		int c;
		for (c=1; c<inten_ranks.size(); c++)
			if (frag_intens[c]>0)
				rank_positions[inten_ranks[c]]=c;

		// self offset features

		mass_t max_self_off=0;
		mass_t avg_self_off=0;
		int	   num_frags_detected=0;

		int b;
		for (b=1; b<=max_break; b++)
		{
			exp_peak_masses[b]=fragment.calc_expected_mass(break_masses[b],pm_with_19);
			if (frag_intens[b]<=0)
				continue;

			const float& frag_mass = frag_masses[b];
			const float& peak_mass = exp_peak_masses[b];
			mass_t offset=fabs(frag_masses[b]-exp_peak_masses[b]);

			if (offset>3.0)
			{
				cout << "Error: bad peak offset calculations!" << endl;
				exit(1);
			}
			avg_self_off+=offset;
			if (offset>max_self_off)
				max_self_off=offset;
			num_frags_detected++;
		}

		rbs.add_real_feature(f_idx++,num_frags_detected);

		if (num_frags_detected==0)
		{
			f_idx+=7;
			continue;
		}
		
		rbs.add_real_feature(f_idx++,max_self_off);
		if (num_frags_detected>0)
			rbs.add_real_feature(f_idx,(avg_self_off/num_frags_detected));
		f_idx++;
		
		if (num_frags_detected<5)
		{
			f_idx+=4;
			continue;
		}

		// between peak offsets
		int num_gaps=0;
		mass_t max_consec_offset=0;
		mass_t avg_consec_offset=0;

		for (b=1; b<exp_peak_masses.size(); b++)
			if (frag_intens[b]>0)
				break;
		int prev=b++;

		bool in_gap=false;
		int num_gap_offsets=0;
		for ( ; b<exp_peak_masses.size(); b++)
			if (frag_intens[b]<=0)
			{
				if (! in_gap)
				{
					num_gaps++;
					in_gap=true;
				}
			}
			else
			{
				in_gap=false;
				const mass_t offset=fabs(exp_peak_masses[b] -
										 exp_peak_masses[prev] -
										 frag_masses[b] + 
										 frag_masses[prev]);
				if (fabs(offset)>3.0)
				{
					
					cout << "Exp " << b << " " << exp_peak_masses[b] << endl;
					cout << "Exp " << prev << " " << exp_peak_masses[prev] << endl;
					cout << "Frag " << b << " " << frag_masses[b] << endl;
					cout << "Frag " << prev << " " << frag_masses[prev] << endl;
					exit(0);
				}



				if (offset>max_consec_offset)
					max_consec_offset=offset;
				avg_consec_offset+=offset;
				num_gap_offsets++;
			}

		rbs.add_real_feature(f_idx++,max_consec_offset);
		if (num_gap_offsets>0)
			rbs.add_real_feature(f_idx,avg_consec_offset/num_gap_offsets);
		f_idx++;
		
		// Peak grab features

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -