⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 peakrank_combined.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 4 页
字号:
		set_combined_simple_feature_names_in_rankboost_model(prank);
	}
	else
	{
		cout << "Error: no feature names supported for type " << prank->get_feature_set_type() << endl;
		exit(1);
	}
			
	// fill RankBoostSamples and create rank ds
	RankBoostDataset         rank_ds,test_ds;
	cout << "TRAINING..." << endl;
	cout << "Max weight ratio: " << max_weight_ratio << endl;

	// initialize and read the test set if it exists
	RankBoostDataset *test_set_ptr=NULL;
	if (test_set_file)
	{
		vector<float>			 test_peak_intens;
		vector<PeakStart>        test_peak_starts;
		vector<int>				 test_peak_frag_types;

		cout << endl << "Reading test tps..." << endl; 

		prank->read_training_peptides_into_combined_rank_boost_dataset(charge, size_idx, mobility,
			test_tps, test_ds, test_peak_intens, test_peak_starts, test_peak_frag_types);

		cout << "Test peak starts: " << test_peak_starts.size() << endl;
		cout << "Test peak intens: " << test_peak_intens.size() << endl;
		cout << "Test peak frags : " << test_peak_frag_types.size() << endl;

		cout << "Creating test phi list..." << endl;

		create_phi_list_from_combined_peak_samples(test_peak_intens, test_peak_frag_types,
			test_peak_starts,  max_frag_idx, test_ds.get_non_const_phi_support());

		test_ds.compute_total_phi_weight();
		test_set_ptr = &test_ds;

		// choose length (try shorte peptide if not eonough samples, go for the max)
		if (test_peptide_length == 0)
		{
			vector<int> test_length_counts;
			test_length_counts.resize(200,0);
			const vector<RankBoostSample>& samples = test_ds.get_samples();
			vector<int> sizes;
			sizes.resize(test_ds.get_num_groups(),0);
			int i;
			for (i=0; i<samples.size(); i++)
				sizes[samples[i].group_idx]=samples[i].tag3;

			for (i=1; i<sizes.size(); i++)
				test_length_counts[sizes[i]]++;
			
			int max=0;
			for (i=0; i<200; i++)
			{
				if (test_length_counts[i]>=500)
					break;
				if (test_length_counts[i]>test_length_counts[max])
					max=i;
			}

			if (i<200)
			{
				test_peptide_length = i;
			}
			else
				test_peptide_length = max;
		}
		cout << "test length " << test_peptide_length << endl << endl;
	}
		
	vector<float>		 train_peak_intens;
	vector<PeakStart>    train_peak_starts;
	vector<int>			 train_peak_frag_types;
	cout << endl << "reading training tps..." << endl;

	prank->read_training_peptides_into_combined_rank_boost_dataset(charge, size_idx, mobility,
		sample_tps, rank_ds, train_peak_intens, train_peak_starts, train_peak_frag_types);

	cout << "Train peak starts: " << train_peak_starts.size() << endl;
	cout << "Train peak intens: " << train_peak_intens.size() << endl;
	cout << "Train peak frags : " << train_peak_frag_types.size() << endl;

	cout << "create training phi list..." << endl;

	create_phi_list_from_combined_peak_samples(train_peak_intens, train_peak_frag_types,
			train_peak_starts,  max_frag_idx, rank_ds.get_non_const_phi_support());

	cout << "initializing boost..." << endl;
	rank_ds.compute_total_phi_weight();
	rank_ds.initialize_potenital_lists();
	rank_ds.initialize_binary_one_lists(combined_frag_boost_model.get_num_binary_features());
	rank_ds.initialize_binary_ordered_phi_lists(combined_frag_boost_model.get_ptr_to_binary_feature_names());
	rank_ds.initialzie_real_feature_table(combined_frag_boost_model.get_num_real_features());

	rank_ds.set_max_ratio_for_regular_update(max_weight_ratio);
				
	combined_frag_boost_model.init_rankboost_model_for_training(rank_ds,40,100);
		
	rank_ds.initialize_real_vote_lists(combined_frag_boost_model);

	char report_prefix[512];
	if (report_dir)
		sprintf(report_prefix,"%s/%s",report_dir, partition_name.c_str());
	
	if (report_prefix)
	{
		char name_buff[512];
		sprintf(name_buff,"%s_feature_summary.txt",report_prefix);
		ofstream sum_stream(name_buff);
		combined_frag_boost_model.summarize_features(rank_ds.get_samples(),sum_stream);
		sum_stream.close();
	}

	// write same info as the write_combined_model function performs
	vector<string> model_header_strings;
	ostringstream oss;
	oss << feature_set_type << " " << this->fragment_type_idxs.size();
	for (f=0; f<fragment_type_idxs.size(); f++)
		oss << " " << fragment_type_idxs[f];

	model_header_strings.push_back(oss.str());

	vector<idx_weight_pair> miss_pairs;
	combined_frag_boost_model.train_rankboost_model(rank_ds,
								max_num_rounds, 
								&miss_pairs, 
								test_set_ptr, 
								test_peptide_length, 
								report_prefix, 
								stop_signal_file,
								&model_header_strings);
		
	// final report
	if (report_dir)
	{
		char name_buff[512];
		
		sprintf(name_buff,"%s_feature_list.txt",report_prefix);
		ofstream feature_stream(name_buff);
		if (! feature_stream.is_open() || ! feature_stream.good())
		{
			cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
			exit(1);
		}

		
		cout << "[...";
		combined_frag_boost_model.ouput_importance_ranked_feature_list(rank_ds,feature_stream);
		cout << " ...]" << endl;
		feature_stream.close();

		// write model (also compresses features and deletes the default values)
		sprintf(name_buff,"%s_model.txt",report_prefix);
		ofstream model_stream(name_buff);
	
		
		vector<peak_rank_stat> dummy_stats;

		cout << "Before reduction error: " << setprecision(7) << 
			combined_frag_boost_model.calc_prediction_error(test_ds, dummy_stats, test_peptide_length) << endl;
		write_combined_partition_model(name_buff);
		cout << "After  reduction error: " << setprecision(7) << 
			combined_frag_boost_model.calc_prediction_error(test_ds, dummy_stats, test_peptide_length) << endl;
	}	
}



void PartitionModel::fill_combined_peak_features(
									const PeakRankModel *prank,
									const  vector<int>& org_amino_acids,
									const int    cut_idx,
									const mass_t cut_mass,
									const PeptideSolution& sol,
									const FragmentType& frag,
									const int   position_idx_in_model_fragment_type_idxs,
									RankBoostSample& sample) const
{
	const vector<string>& model_aa_labels = prank->get_model_aa_labels();
	const vector<int>& session_aas_to_model_aas = prank->get_session_aas_to_model_aas();
	const int length = org_amino_acids.size();
	const int num_aas = model_aa_labels.size();
	const mass_t pm_with_19 = sol.pm_with_19;
	const int spec_charge = sol.charge;

	int r_idx=0;
	int i;

	vector<int> amino_acids;
	prank->convert_aas_to_model_aas(org_amino_acids, amino_acids);

	if (amino_acids.size() != org_amino_acids.size())
	{
		cout << "Error: aa size mismatch!" << endl;
		exit(1);
	}

	if (cut_idx<=0 || cut_idx>=amino_acids.size())
	{
		cout << "Error: cut_idx is bad!" << endl;
		exit(1);
	}

	// need to use the special Idx variables and not the regular enumerations
	const int HisIdx = session_aas_to_model_aas[His];
	const int LysIdx = session_aas_to_model_aas[Lys];
	const int ArgIdx = session_aas_to_model_aas[Arg];
	const int SerIdx = session_aas_to_model_aas[Ser];
	const int ThrIdx = session_aas_to_model_aas[Thr];
	const int ProIdx = session_aas_to_model_aas[Pro];
	const int GlyIdx = session_aas_to_model_aas[Gly];
	const int AlaIdx = session_aas_to_model_aas[Ala];
	const int LeuIdx = session_aas_to_model_aas[Leu];
	const int AsnIdx = session_aas_to_model_aas[Asn];
	const int AspIdx = session_aas_to_model_aas[Asp];
	const int GluIdx = session_aas_to_model_aas[Glu];

	// special N C side aa indicators
	int num_nH=0, num_cH=0;
	int num_nK=0, num_cK=0;
	int num_nR=0, num_cR=0;
	
	for (i=0; i<cut_idx; i++)
	{
		if (amino_acids[i] == HisIdx)
			num_nH++;

		if (amino_acids[i] == LysIdx)
			num_nK++;

		if (amino_acids[i] == ArgIdx)
			num_nR++;
	}

	for (i=cut_idx; i<length; i++)
	{
		if (amino_acids[i] == HisIdx)
			num_cH++;

		if (amino_acids[i] == LysIdx)
			num_cK++;

		if (amino_acids[i] == ArgIdx)
			num_cR++;
	}

	// MASS / LOCATION FEATURES (REAL + BINARY)
	const mass_t max_detected_mass = prank->get_max_detected_mass();
	const mass_t exp_peak_mass = frag.calc_expected_mass(cut_mass,pm_with_19);
	const mass_t min_obs_mass = prank->calc_min_detected_mass(pm_with_19,spec_charge);
	const mass_t max_obs_mass = (pm_with_19>max_detected_mass ? max_detected_mass : pm_with_19);
	
	const float peak_mass_prop = ((exp_peak_mass - min_obs_mass)/(max_obs_mass - min_obs_mass));
	const float rounded_peak_prop = 0.02*floor(peak_mass_prop * 50.0);

	const mass_t dis_from_min = 20.0*floor((exp_peak_mass - min_obs_mass)*0.05);
	const mass_t dis_from_max = 20.0*floor((max_obs_mass  - exp_peak_mass)*0.05);
	const mass_t dis_from_minmax = (dis_from_min<dis_from_max ? dis_from_min : prank->get_max_detected_mass() - dis_from_max);

	const int RKH_n_combo_idx = calc_RKH_combo_idx(num_nR,num_nK,num_nH);
	const int RKH_c_combo_idx = calc_RKH_combo_idx(num_cR,num_cK,num_cH);
	const int RKH_pair_idx = (RKH_n_combo_idx * num_RKH_combos) + RKH_c_combo_idx;
	const float RKH_liniar_pair_idx = RKH_pair_matrix[RKH_n_combo_idx][RKH_c_combo_idx];

	const float cut_prop = 0.02 * floor((cut_mass / pm_with_19)*50);
	const float cut_n_mass = 20 * floor(cut_mass*0.05);
	const float cut_c_mass = 20 * floor((pm_with_19-cut_mass)*0.05);
	const int cut_dis_from_n = cut_idx;
	const int cut_dis_from_c = length-cut_idx;

	const int third=length/3;
	const int two_thirds = length - third;
	float cut_pos=NEG_INF;
	if (cut_idx<=4 || (cut_idx<=third && cut_idx<8))
	{
		cut_pos = (float)cut_idx;
	}
	else if (cut_idx>=length-4 || (cut_idx>=two_thirds && cut_idx>length-8))
	{
		cut_pos = (float)(20-length+cut_idx);
	}
	else
		cut_pos = 10.0 + cut_prop;

	const int n_aa = amino_acids[cut_idx-1];
	const int c_aa = amino_acids[cut_idx];
	const int n_term_aa = amino_acids[0];
	const int c_term_aa = amino_acids[length-1];


	sample.clear();

	// first feature is always 0, tells what fragment this is
	sample.add_real_feature(0,(float)position_idx_in_model_fragment_type_idxs);

	// this is the new starting index for features in this sample. The offset
	// is computaed according to the fragment type
	int f_idx =1 + position_idx_in_model_fragment_type_idxs * num_features_per_frag; // num_features_per_frag_type

	if (num_features_per_frag==0)
	{
		cout << "Error: num_features_per_frag is 0!" << endl;
		exit(1);
	}

	// add general position features
	sample.add_real_feature(f_idx++,dis_from_minmax);
	sample.add_real_feature(f_idx++,rounded_peak_prop);
	sample.add_real_feature(f_idx++,RKH_pair_idx);
	sample.add_real_feature(f_idx++,RKH_liniar_pair_idx);
	sample.add_real_feature(f_idx++,cut_prop);
	sample.add_real_feature(f_idx++,cut_pos);
	sample.add_real_feature(f_idx++,cut_n_mass);
	sample.add_real_feature(f_idx++,cut_c_mass);
	sample.add_real_feature(f_idx++,(float)cut_dis_from_n);
	sample.add_real_feature(f_idx++,(float)cut_dis_from_c);
	
	const float position_var = rounded_peak_prop; // this will be used for all amino acid features

	// fill aa flanking features N side
	if (cut_idx>0)
		sample.add_real_feature(f_idx+amino_acids[cut_idx-1],position_var);
	f_idx+=num_aas;
	if (cut_idx>1)
		sample.add_real_feature(f_idx+amino_acids[cut_idx-2],position_var);
	f_idx+=num_aas;
	if (cut_idx>2)
		sample.add_real_feature(f_idx+amino_acids[cut_idx-3],position_var);
	f_idx+=num_aas;

	// fill aa flanking features C side
	if (cut_idx<length)
		sample.add_real_feature(f_idx+amino_acids[cut_idx],position_var);
	f_idx+=num_aas;
	if (cut_idx<length-1)
		sample.add_real_feature(f_idx+amino_acids[cut_idx+1],position_var);
	f_idx+=num_aas;
	if (cut_idx<length-2)
		sample.add_real_feature(f_idx+amino_acids[cut_idx+2],position_var);
	f_idx+=num_aas;

	// fill cut pair features X-Y
	sample.add_real_feature(f_idx+(n_aa*num_aas+c_aa),position_var);
	f_idx+=(num_aas*num_aas);

	// fill aa count features (up top 3 aa's away from cut)
	vector<int> n_aa_counts, c_aa_counts;
	n_aa_counts.resize(num_aas+1,0);
	c_aa_counts.resize(num_aas+1,0);

	for (i=0; i<cut_idx-3; i++)
		n_aa_counts[amino_acids[i]]++;

	for (i=cut_idx+3; i<length; i++)
		c_aa_counts[amino_acids[i]]++;

	int a;
	for (a=0; a<num_aas; a++)
		if (n_aa_counts[a]>0)
			sample.add_real_feature(f_idx+a,n_aa_counts[a]);
	f_idx+=num_aas;

	for (a=0; a<num_aas; a++)
		if (c_aa_counts[a]>0)
			sample.add_real_feature(f_idx+a,c_aa_counts[a]);
	f_idx+=num_aas;

	// add Cut\terminal features
	sample.add_real_feature(f_idx+n_term_aa,cut_pos);
	f_idx+=num_aas;
	sample.add_real_feature(f_idx+c_term_aa,cut_pos);
	f_idx+=num_aas;

	// X|
	if (cut_idx>0)
	{
		sample.add_real_feature(f_idx+amino_acids[cut_idx-1],cut_pos);
		f_idx+=num_aas;
		if (c_term_aa == LysIdx)
			sample.add_real_feature(f_idx+amino_acids[cut_idx-1],cut_pos);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -