📄 denovopartmodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
		int i;
		for (i=0; i<amino_acids.size(); i++)
			if (amino_acids[i] == m16_aa_idx)
			{
				float total_inten=0;
				int f;
				for (f=0; f<ppp_fragments.size(); f++)
				{
					if (ppp_fragments[f].orientation == PREFIX)
					{
						const float& inten = intens[ppp_frag_type_idxs[f]][i+1];
						if (inten>0)
							total_inten+= inten;
					}
					else
					{
						const float& inten = intens[ppp_frag_type_idxs[f]][i];
						if (inten>0)
							total_inten+= inten;
					}
				}
				pairs.push_back(score_pair(i,total_inten));
			}
		sort(pairs.begin(),pairs.end());
		vector<mass_t> break_masses;
		sol.pep.calc_expected_breakage_masses(config,break_masses);
		if (pairs.size()>0)
		{
			const int num_aa = amino_acids.size();
			for (i=0; i<pairs.size() && i<MAX_NUM_M16s; i++)
			{
				const int aa_idx=pairs[i].idx;

				if (intens[b_frag_idx].size()>aa_idx+1)
				{
					const float b_inten = intens[b_frag_idx][aa_idx+1];
					const int pre_minus_63_idx = as.get_max_inten_peak(break_masses[aa_idx+1]-63.0,tolerance);
					
					float ratio_pre_minus_63=-10.0;
					if (b_inten == 0)
					{
						if (pre_minus_63_idx>=0)
							ratio_pre_minus_63 = 11.0;
					}
					else
					{
						ratio_pre_minus_63 = ( pre_minus_63_idx<0 ? 0 : as.get_peak_intensity(pre_minus_63_idx)/b_inten);
						if (ratio_pre_minus_63>10.0)
							ratio_pre_minus_63=10.0;
					}
					rbs.add_real_feature(f_idx+i*2,ratio_pre_minus_63);
				}

				if (intens[y_frag_idx].size()>aa_idx)
				{
					const mass_t pep_mass = sol.pep.get_mass();
					const float y_inten = intens[y_frag_idx][aa_idx];
					const int suf_minus_45_idx = as.get_max_inten_peak((pep_mass - break_masses[aa_idx])-45.0,tolerance);
				
					float ratio_suf_minus_45=-10;
				
					if (y_inten == 0)
					{
						if (suf_minus_45_idx>=0)
							ratio_suf_minus_45 = 11.0;
					}
					else
					{
						ratio_suf_minus_45 = ( suf_minus_45_idx<0 ? 0 : as.get_peak_intensity(suf_minus_45_idx)/y_inten);
						if (ratio_suf_minus_45>10.0)
							ratio_suf_minus_45=10.0;
	
					}
					rbs.add_real_feature(f_idx+i*2+1,ratio_suf_minus_45);
				}
			}
		}
	}

	f_idx += 4;
}



void DeNovoRankScorer::init_tables(bool silent_ind)
{
	PeakRankModel *& peak_model = peak_prediction_models[model_type];

	if (! model || ! model->get_ind_pmcsqs_was_intialized())
	{
		cout << "Error: must first initialize the fragment model!" << endl;
		exit(1);
	}

	if (! peak_model || peak_model->get_size_thresholds().size()<=0)
	{
		cout << "Error: must first initialize the peak model!" << endl;
		exit(1);
	}

	const vector< vector< mass_t> >& size_thresholds = peak_model->get_size_thresholds();

	this->dnv_part_models.resize(size_thresholds.size());
	int i;

	if (! silent_ind)
		cout << "Init tables:" << endl;

	for (i=1; i<dnv_part_models.size(); i++)
	{
		dnv_part_models[i].resize(size_thresholds[i].size()+1,NULL);

		if (! silent_ind)
			cout << i << "\t" << size_thresholds[i].size()+1 << endl;
	}
}


void DeNovoPartitionModel::init_features(int model_type, int _charge, int _size_idx,
							const vector<int>& ppp_frags, Config *config)
{
	charge = _charge;
	size_idx = _size_idx;

	ScalingFactor def_scale;
	def_scale.max_pm_with_19 = POS_INF;
	def_scale.score_shift = 0;
	def_scale.score_scale = 1.0;
	scaling_factors.clear();
	scaling_factors.push_back(def_scale);

	feature_names.clear();

	ind_was_initialized = true;

	use_PTM_peak_features	   = true;
	use_tryp_terminal_features = true;
	use_ann_peak_features = true;
	use_inten_balance_features = (model_type != 3);
	use_peak_offset_features = true;
	use_comp_features = true;
	use_pmc_features = (model_type != 3);
	use_prm_features = true;
	use_ppp_features = false;
	use_combined_ppp_features = true;

	if (use_combined_ppp_features && use_ppp_features)
	{
		cout << "Error: must choose combine or regular ppp features, not both!" << endl;
		exit(1);
	}

	num_ppp_frags=ppp_frags.size();
	ppp_frag_type_idxs=ppp_frags;
	ppp_fragments.resize(ppp_frag_type_idxs.size());
	int f;
	for (f=0; f<ppp_frag_type_idxs.size(); f++)
		ppp_fragments[f] = config->get_fragment(ppp_frag_type_idxs[f]);


	// features for special PTM peaks (like the ones for M+16)
	PTM_peak_start_idx = feature_names.size();
	if (use_PTM_peak_features)
	{
		feature_names.push_back("M+16 first p-63 ratio");
		feature_names.push_back("M+16 first s-45 ratio");
		feature_names.push_back("M+16 second p-63 ratio");
		feature_names.push_back("M+16 second s-45 ratio");
	}

	// tryptic terminal features
	tryp_terminal_start_idx = feature_names.size();
	if (use_tryp_terminal_features)
	{
		feature_names.push_back("TRYP #num good tryp terminals");
		feature_names.push_back("TRYP #num missed tryp terminals");
		feature_names.push_back("TRYP C-term AA");
		feature_names.push_back("TRYP #frags at digest when C-term is R");
		feature_names.push_back("TRYP #frags at digest when C-term is K");
		feature_names.push_back("TRYP #frags at digest when C-term is other");
		feature_names.push_back("TRYP AA at N-terminal When C-term is R");
		feature_names.push_back("TRYP AA at N-terminal When C-term is K");
		feature_names.push_back("TRYP AA at N-terminal When C-term is other");
	}

	// Ann peak features
	ann_peak_start_idx = feature_names.size();
	if (use_ann_peak_features)
	{
		feature_names.push_back("ANN PEAK diff from org pm_with_19");
		feature_names.push_back("ANN PEAK # aas in peptide");
		feature_names.push_back("ANN PEAK %ann intensity");
		feature_names.push_back("ANN PEAK %ann peaks");
		feature_names.push_back("ANN PEAK #ann in top 25");
		feature_names.push_back("ANN PEAK #ann in top half (up to 50)");
		feature_names.push_back("ANN PEAK #ann in top third - #ann in mid third");
		feature_names.push_back("ANN PEAK #ann in top third - #ann in last third");
		feature_names.push_back("ANN PEAK #ann in mid third - #ann in last third");

		const vector<FragmentType>& all_fragments = config->get_all_fragments();
		int f;
		for (f=0; f<all_fragments.size() && f<7; f++)
		{
			const string frag_label = all_fragments[f].label;
			feature_names.push_back( "ANN PEAK #" + frag_label + " annotated");
		}
	}

	// inten balance features
	inten_balance_start_idx = feature_names.size();
	if (use_inten_balance_features)
	{
		feature_names.push_back("INTEN BAL c_idx - n_idx");
		feature_names.push_back("INTEN BAL RHK N");
		feature_names.push_back("INTEN BAL RHK C");
		feature_names.push_back("INTEN BAL RHK pair");
		feature_names.push_back("INTEN BAL prefix prop, pair -4,-5");
		feature_names.push_back("INTEN BAL prefix prop, pair -2,-3");
		feature_names.push_back("INTEN BAL prefix prop, pair -1,0,+1");
		feature_names.push_back("INTEN BAL prefix prop, pair +2,+3");
		feature_names.push_back("INTEN BAL prefix prop, pair +4,+5");

		feature_names.push_back("INTEN BAL all prefix prop, pair -4,-5");
		feature_names.push_back("INTEN BAL all prefix prop, pair -2,-3");
		feature_names.push_back("INTEN BAL all prefix prop, pair -1,0,+1");
		feature_names.push_back("INTEN BAL all prefix prop, pair +2,+3");
		feature_names.push_back("INTEN BAL all prefix prop, pair +4,+5");
	}

	// Peak offset features
	peak_offset_start_idx=feature_names.size();
	if (use_peak_offset_features)
	{
		int f;
		for (f=0; f<ppp_frag_type_idxs.size() && f<2; f++)
		{
			const int frag_idx = ppp_frag_type_idxs[f];
			const string frag_label = config->get_fragment(frag_idx).label;
			const string prefix = "PEAK OFF " + frag_label + " ";

			feature_names.push_back(prefix+"num frags detected");
			feature_names.push_back(prefix+"max self offset");
			feature_names.push_back(prefix+"avg self offset");

			feature_names.push_back(prefix+"max consecutive offset");
			feature_names.push_back(prefix+"avg consecutive offset");

			// peak grab feature
			feature_names.push_back(prefix+"grab offset #1");
			feature_names.push_back(prefix+"grab offset #2");
			feature_names.push_back(prefix+"grab offset #3"); 
		}

		if (f<2)
		{
			int i;
			for (i=0; i<8; i++)
				feature_names.push_back("PEAK OFF dummy");
		}
	}

	// Peptide composition features
	comp_start_idx=feature_names.size();
	if (use_comp_features)
	{
		feature_names.push_back("PEP COMP start cat N (len 3)");
		feature_names.push_back("PEP COMP end cat C (len 3)");
		feature_names.push_back("PEP COMP len 3 # cat 19-20");
		feature_names.push_back("PEP COMP len 3 # cat 15-18");
		feature_names.push_back("PEP COMP len 3 # cat 7-14");
		feature_names.push_back("PEP COMP len 3 # cat 3-6");
		feature_names.push_back("PEP COMP len 3 # cat 1-2");
		feature_names.push_back("PEP COMP min cat, len 3");
		feature_names.push_back("PEP COMP avg cat, len 3");	
		feature_names.push_back("PEP COMP before cat score 1");
		feature_names.push_back("PEP COMP after cat score 1");
		feature_names.push_back("PEP COMP span cat score 1");
		feature_names.push_back("PEP COMP before cat score 2");
		feature_names.push_back("PEP COMP after cat score 2");
		feature_names.push_back("PEP COMP span cat score 2");
		feature_names.push_back("PEP COMP before cat score 3");
		feature_names.push_back("PEP COMP after cat score 3");
		feature_names.push_back("PEP COMP span cat score 3");
		feature_names.push_back("PEP COMP before cat score 4");
		feature_names.push_back("PEP COMP after cat score 4");
		feature_names.push_back("PEP COMP span cat score 4");

		const vector<string>& aa2label = config->get_aa2label();
		int a;
		for (a=Ala; a<=Val; a++)
		{
			if (a==Ile)
				continue;
			feature_names.push_back("PEP COMP #aa " + aa2label[a]);	
		}

		feature_names.push_back("PEP COMP #problematic double combos");
		feature_names.push_back("PEP COMP #double combo=W");
		feature_names.push_back("PEP COMP #double combo=Q");
		feature_names.push_back("PEP COMP #double combo=N");
		feature_names.push_back("PEP COMP #double problematic combos with XG");
	}

	// PMCSQS features
	pmc_start_idx=feature_names.size();
	if (use_pmc_features)
	{
		feature_names.push_back("PMCSQS sqs prob for peptide charge");
		feature_names.push_back("PMCSQS prob for peptide charge");
		feature_names.push_back("PMCSQS mass diff from pm1, prob>0.95");
		feature_names.push_back("PMCSQS mass diff from pm1, prob<=0.95");
		feature_names.push_back("PMCSQS score1 for peptide charge");
		feature_names.push_back("PMCSQS score2 for peptide charge");
		feature_names.push_back("PMCSQS mass diff from pm2");
		feature_names.push_back("PMCSQS max  prob for other charges");
		feature_names.push_back("PMCSQS score diff from max score with this charge, prob>=0.95");
		feature_names.push_back("PMCSQS score diff from max score with this charge, 0.95>prob>=0.7");
		feature_names.push_back("PMCSQS score diff from max score with this charge, prob<0.7");
	}

	prm_start_idx = feature_names.size();
	if (use_prm_features) // use these feature only with de novo
	{
		const string term_combo[4]={"N/C","N/-C","-N/C","-N/-C"};
		int i;
		for (i=0; i<4; i++)
		{
			if (i>0 && (model_type != 1 && model_type != 3))
				break;

			feature_names.push_back("PRM " + term_combo[i] + " delta mass");
			feature_names.push_back("PRM " + term_combo[i] + " total breakage score");
			feature_names.push_back("PRM " + term_combo[i] + " average breakage score");
			feature_names.push_back("PRM " + term_combo[i] + " normalized average breakage score");
			feature_names.push_back("PRM " + term_combo[i] + " path score");
			feature_names.push_back("PRM " + term_combo[i] + " average path score");
		}

		// adjust to random prob in spectrum (so model can be used with FT)
		if (model_type == 1 || model_type == 3)
		{
			feature_names.push_back("PRM path score");
			feature_names.push_back("PRM total breakage score");
			feature_names.push_back("PRM SeqPath rank");
			feature_names.push_back("PRM multipath score");

			feature_names.push_back("PRM delta score");
			feature_names.push_back("PRM rank, delta score<=1.5");
			feature_names.push_back("PRM rank, 1.5<delta score<=7.5");
			feature_names.push_back("PRM rank, 7.5<delta score<=15");
			feature_names.push_back("PRM rank, delta score>15");

			if (model_type == 3)
			{
				feature_names.push_back("PRM tag, percent in top 5 denovo");
				feature_names.push_back("PRM tag, percent in top 20 denovo");
				feature_names.push_back("PRM tag, percent in all denovo");
				feature_names.push_back("PRM tag, rank if in top 5");
				feature_names.push_back("PRM tag, rank if in top 5-20");
				feature_names.push_back("PRM tag, rank if in top 20-all");
				feature_names.push_back("PRM tag, highest full denovo rank");
			}
		}

		feature_names.push_back("PRM delta num breakage scores (missing)");
	//	feature_names.push_back("PRM num missing edges");
		feature_names.push_back("PRM num forbidden node pairs");
		feature_names.push_back("PRM num breakage scores");
		feature_names.push_back("PRM breakage score min 1");
		feature_names.push_back("PRM breakage score min 2");
		feature_names.push_back("PRM breakage score min 3");
		feature_names.push_back("PRM breakage score min consecutive 3");
		feature_names.push_back("PRM breakage score max consecutive 3");
		feature_names.push_back("PRM breakage score min consecutive 2");
		feature_names.push_back("PRM breakage score max consecutive 2");
		feature_names.push_back("PRM #breakage scores below -10"); 
		feature_names.push_back("PRM #breakage scores 0 - -10");
		feature_names.push_back("PRM #breakage scores 0 - 8");
		feature_names.push_back("PRM #breakage scores 8 - 15");
		feature_names.push_back("PRM #breakage scores above 15");
		feature_names.push_back("PRM %breakage scores below -10"); 
		feature_names.push_back("PRM %breakage scores below 0");
		feature_names.push_back("PRM %breakage scores above 0");
		feature_names.push_back("PRM %breakage scores above 8");
		feature_names.push_back("PRM Score connected to N-terminal");
		feature_names.push_back("PRM Score connected to C-terminal");

		feature_names.push_back("PRM %breakages with 1 frag detected");
		feature_names.push_back("PRM %breakages with 2 frag detected");
		feature_names.push_back("PRM %breakages with > 5 frags detected");
		feature_names.push_back("PRM %breakages with dual orientation frags");
		feature_names.push_back("PRM #orientation switches");
	}
	
	// Peak prediction features
	ppp_start_idx=feature_names.size();
	if (use_ppp_features)
	{
		int f;
		for (f=0; f<ppp_frag_type_idxs.size(); f++)
		{
			const int frag_idx = ppp_frag_type_idxs[f];
			const string frag_label = config->get_fragment(frag_idx).label;
			const string prefix = "PPP " + frag_label + " ";
			feature_names.push_back(prefix+"# observed frags");
			feature_names.push_back(prefix+"# predicted frags");
			feature_names.push_back(prefix+"observation ratio");
			feature_names.push_back(prefix+"# observed frags in top 1 predicted");
			feature_names.push_back(prefix+"# observed frags in top 3 predicted");
			feature_names.push_back(prefix+"# observed frags in top 5 predicted");
			feature_names.push_back(prefix+"# observed frags in top 7 predicted");
			feature_names.push_back(prefix+"% observed frags in top 1/6 predicted");
			feature_names.push_back(prefix+"% observed frags in top 1/3 predicted");
			feature_names.push_back(prefix+"% observed frags in top 1/2 predicted");
			feature_names.push_back(prefix+"% observed frags in top 2/3 predicted");
			feature_names.push_back(prefix+" predicted rank of first missing peak");
			feature_names.push_back(prefix+" predicted rank of second missing peak");
			feature_names.push_back(prefix+" predicted rank of third missing peak");
			feature_names.push_back(prefix+" predicted rank of first+second missing peak");
			feature_names.push_back(prefix+" predicted rank of first+second+third missing peak");
			feature_names.push_back(prefix+"score offset of rank 1");
			feature_names.push_back(prefix+"score offset of rank 2");
			feature_names.push_back(prefix+"score offset of rank 3");
			feature_names.push_back(prefix+"score offset of rank 4");
			feature_names.push_back(prefix+"score offset of rank 5");
			feature_names.push_back(prefix+"score offset of rank 6");
			feature_names.push_back(prefix+"score offset of rank 7");
			feature_names.push_back(prefix+"score offset of rank 8");
			feature_names.push_back(prefix+"score offset of rank 9");
			feature_names.push_back(prefix+"score offset of rank 10");
		}
	
		feature_names.push_back("PPP #comp pairs");
		feature_names.push_back("PPP stat of predicted pair #1");
		feature_names.push_back("PPP stat of predicted pair #2");
		feature_names.push_back("PPP stat of predicted pair #3");
		feature_names.push_back("PPP stat of predicted pair #4");
		feature_names.push_back("PPP stat of predicted pair #5");
		feature_names.push_back("PPP stat of predicted pair #6");
		feature_names.push_back("PPP stat of predicted pair #7");
	}

	combined_ppp_start_idx = feature_names.size();
	if (use_combined_ppp_features)
	{
		feature_names.push_back("COMB PPP mobility");
		feature_names.push_back("COMP PPP frag 1 obs_ratio");
		feature_names.push_back("COMP PPP frag 2 obs_ratio");
		feature_names.push_back("COMP PPP frag 3 obs_ratio");
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -