⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 partitionmodel.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 2 页
字号:
			if (test_peptide_length == 0)
			{
				vector<int> test_length_counts;
				test_length_counts.resize(200,0);
				const vector<RankBoostSample>& samples = test_ds.get_samples();
				vector<int> sizes;
				sizes.resize(test_ds.get_num_groups(),0);
				int i;
				for (i=0; i<samples.size(); i++)
					sizes[samples[i].group_idx]=samples[i].tag3;

				for (i=1; i<sizes.size(); i++)
					test_length_counts[sizes[i]]++;
				
				int max=0;
				for (i=0; i<200; i++)
				{
					if (test_length_counts[i]>=500)
						break;
					if (test_length_counts[i]>test_length_counts[max])
						max=i;
				}

				if (i<200)
				{
					test_peptide_length = i;
				}
				else
					test_peptide_length = max;
			}
			cout << "test length " << test_peptide_length << endl;
		}
	
		
		cout << "read training tps..." << endl;

		prank->read_training_peptides_into_rank_boost_dataset(frag_idx, charge,  
			sample_tps, rank_ds, peak_intens, peak_starts, max_annotated_intens);

		RankBoostModel& boost = frag_models[f];
		boost.init_rankboost_model_feature_names(prank->get_binary_names(),prank->get_real_names());

		cout << "create training phi list..." << endl;

		create_phi_list_from_samples(peak_intens,peak_starts, max_annotated_intens, 
			rank_ds.get_non_const_phi_support());

		cout << "initializing boost..." << endl;

		rank_ds.compute_total_phi_weight();
		rank_ds.initialize_potenital_lists();
		rank_ds.initialize_binary_one_lists(prank->get_binary_names().size());
		rank_ds.initialize_binary_ordered_phi_lists(boost.get_ptr_to_binary_feature_names());
		rank_ds.initialzie_real_feature_table(prank->get_real_names().size());

		rank_ds.set_max_ratio_for_regular_update(max_weight_ratio);
				
		boost.init_rankboost_model_for_training(rank_ds,40,100);
		
		rank_ds.initialize_real_vote_lists(boost);

	//	boost.summarize_features(rank_ds.get_samples());

		char report_prefix[512];
		if (report_dir)
			sprintf(report_prefix,"%s/%s_%d",report_dir, partition_name.c_str(),frag_idx);

		vector<idx_weight_pair> miss_pairs;

		boost.train_rankboost_model(rank_ds,
									max_num_rounds, 
									&miss_pairs, 
									test_set_ptr, 
									test_peptide_length, 
									report_prefix, 
									stop_signal_file);

		

		// final report
		if (report_dir)
		{
			char name_buff[512];

		
			sprintf(name_buff,"%s_train_miss_pairs.txt",report_prefix);
			ofstream report_stream(name_buff);
			if (! report_stream.is_open() || ! report_stream.good())
			{
				cout << "Error: couldn't open pairs report file for writing:" << name_buff << endl;
				exit(1);
			}

			simple_print_peak_pairs(miss_pairs, sample_tps, rank_ds, prank, frag_idx, 250, report_stream);
			report_stream.close();


			sprintf(name_buff,"%s_feature_list.txt",report_prefix);
			ofstream feature_stream(name_buff);
			if (! feature_stream.is_open() || ! feature_stream.good())
			{
				cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
				exit(1);
			}
			cout << "[...";
			boost.ouput_importance_ranked_feature_list(rank_ds,feature_stream);
			cout << " ...]" << endl;
			feature_stream.close();


			// write model (also compresses features and deletes the default values)
			sprintf(name_buff,"%s_model.txt",report_prefix);
			ofstream model_stream(name_buff);
			boost.write_rankboost_model(model_stream,true);
			model_stream.close();
		}	
		else // send to cout
			simple_print_peak_pairs(miss_pairs, sample_tps, rank_ds, prank, frag_idx, 100);
	}
}



void  PartitionModel::set_partition_name(const string& peak_rank_model_name, 
									     int charge, int size_idx, int mobility)
{
	char buff[256];
	
	sprintf(buff,"%s_%d_%d_%d",peak_rank_model_name.c_str(),charge,size_idx,mobility);

	partition_name  = string(buff);
}



/*************************************************************************
**************************************************************************/
void PartitionModel::simple_print_peak_pairs(
						  const vector<idx_weight_pair>& pair_idxs, 
						  const vector<TrainingPeptide>& tps,
						  const RankBoostDataset& ds,
						  PeakRankModel *prank,
						  int frag,
						  int max_examples,
						  ostream& os) const
{
	const int max_examples_for_tp = 2;
	const vector<SamplePairWeight>& phi_support= ds.get_phi_support();
	vector<int> tp_idx_counts;
	tp_idx_counts.clear();

	if (this->feature_set_type > 2)
	{
		cout << "Error: this funcion is not designed for feature set " << feature_set_type << endl;
		exit(1);
	}

	int f;
	for (f=0; f<this->fragment_type_idxs.size(); f++)
		if (fragment_type_idxs[f]==frag)
			break;
	if (f== fragment_type_idxs.size())
	{
		cout << "Error: partition does not have an initialized model for frag " << frag << endl;
		exit(1);
	}
	const int frag_pos=f;

	int p_idx;
	int counter = 0;
	for (p_idx=0; p_idx<pair_idxs.size(); p_idx++)
	{
		const int pair_idx = pair_idxs[p_idx].idx;
		const int x0_idx = phi_support[pair_idx].idx1;
		const int x1_idx = phi_support[pair_idx].idx2;
		const RankBoostSample sam_x0 = ds.get_sample(x0_idx);
		const RankBoostSample sam_x1 = ds.get_sample(x1_idx);
		const int tp_idx = sam_x0.group_idx;
		const int x0_cut_idx = sam_x0.tag2;
		const int x1_cut_idx = sam_x1.tag2;
		const TrainingPeptide& tp = tps[tp_idx];
		const RankBoostModel& boost_model = frag_models[frag_pos];

		const int num_binary_features = boost_model.get_num_binary_features();
		const int num_real_features   = boost_model.get_num_real_features();
		const vector<string>& binary_feature_names = boost_model.get_binary_feature_names();
		const vector<string>& real_feature_names   = boost_model.get_real_feature_names();
		const vector<string>& model_aa_labels = prank->get_model_aa_labels();

		const vector<int>& bin_updates  = boost_model.get_binary_update_counts();
		const vector<int>& real_updates = boost_model.get_real_update_counts();

		int j;
		for (j=0; j<tp_idx_counts.size(); j++)
			if (tp_idx_counts[j]==tp_idx)
				break;

		if (j==tp_idx_counts.size())
		{
			tp_idx_counts.push_back(tp_idx);
		}
		else
		{
			if (tp_idx_counts[j]>=max_examples_for_tp)
				continue;
			tp_idx_counts[j]++;
		}

		counter++;

		vector<int> amino_acids;
		prank->convert_aas_to_model_aas(tp.amino_acids,amino_acids);

		if (sam_x0.group_idx <0 || sam_x0.tag2<0 || sam_x1.group_idx <0 || sam_x1.tag2<0)
		{
			cout << "Error: bad tags attached to samples:" << endl;
			cout << sam_x0.group_idx << " " << sam_x0.tag1 << " " << sam_x0.tag2 << endl;
			cout << sam_x1.group_idx << " " << sam_x1.tag1 << " " << sam_x1.tag2 << endl;
			exit(1);
		}
		
		if (sam_x0.group_idx != sam_x1.group_idx)
		{
			cout << "Error: group_idx mismathces!" << endl;
			exit(1);
		}

		const int pos = tp.get_frag_idx_pos(frag);
		if (pos<0)
		{
			cout << "Error: frag not in tp intens!" << endl;
			exit(1);
		}
		const vector<float>& intens = tp.intens[pos];

	
		os << counter << "\t" << phi_support[pair_idx].weight << "\t" << pair_idxs[p_idx].weight << "\t";

		int a;
		for (a=0; a<amino_acids.size(); a++)
		{
			if (a == x0_cut_idx)
				os << " 0 ";

			if (a == x1_cut_idx)
				os << " 1 ";

			os << model_aa_labels[amino_acids[a]];
		}
		os << endl;



		os << "x0: cut " << x0_cut_idx << "  inten " << intens[x0_cut_idx] << "\t score " <<
			boost_model.calc_rank_score(sam_x0) << endl;

		os << "x1: cut " << x1_cut_idx << "  inten " << intens[x1_cut_idx] << "\t score " <<
			boost_model.calc_rank_score(sam_x1) << endl;

		os << endl << endl;

		if (max_examples>0 && counter>=max_examples)
			break;
	}
}


void   PartitionModel::print_model_stats() const
{
	if (this->feature_set_type <=2)
	{
		cout << this->partition_name << " \t";
		int i;
		for (i=0; i<10; i++)
		{
			int j;
			for (j=0; j<this->fragment_type_idxs.size(); j++)
				if (fragment_type_idxs[j]==i && frag_models[j].get_ind_was_initialized())
				{
					cout << " *";
					break;
				}
			if (j==fragment_type_idxs.size())
				cout << "  ";
		}
		cout << endl;
	}
	else
	{
		cout << this->partition_name << " \t";
		int i;
		for (i=0; i<10; i++)
		{
			int j;
			for (j=0; j<this->fragment_type_idxs.size(); j++)
				if (fragment_type_idxs[j]==i && this->combined_frag_boost_model.get_ind_was_initialized())
				{
					cout << " *";
					break;
				}
			if (j==fragment_type_idxs.size())
				cout << "  ";
		}
		cout << endl;
	}
}

/***********************************************************************
makes tables listing features and final scores
Only makes table if the predictions match
************************************************************************/
bool PeakRankModel::make_peak_prediction_table(
			const PeptideSolution& sol,
			const vector< vector<intensity_t> >& intens,
			int num_peaks) const
{
	PeptidePeakPrediction ppp;
	calc_peptide_predicted_scores(sol, ppp);

	// the ppp includes a table of rank scores (rows are actual frag idxs, not relative
	// position in the frag_type_idxs).

	// reduce intensities to the same dimensionality
	const int num_frags = ppp.frag_idxs.size();
	vector< vector< float> > observed_intens;
	observed_intens.resize(num_frags);

	int i,f;
	for (f=0; f<num_frags; f++)
	{
		const int frag_idx = ppp.frag_idxs[f];
		observed_intens[f]=intens[frag_idx]; 
	}

	// calculate the ranks and mapping between predicted and observed
	vector< vector<int> > observed_ranks, predicted_ranks;
	calc_combined_peak_ranks(observed_intens, observed_ranks);
	calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);

	vector<int> sel_frags, sel_idxs;
	vector< float > intensities;
	
	
	int rank;
	for (rank=0; rank<num_peaks; rank++)
	{
		bool good_pred=false;
		for (f=0; f<num_frags; f++)
		{
			int i;
			for (i=0; i<predicted_ranks[f].size(); i++)
			{
				if (predicted_ranks[f][i] == rank &&
					observed_ranks[f][i]  == rank)
				{
					good_pred=true;
					sel_frags.push_back(f);
					sel_idxs.push_back(i);
					intensities.push_back(intens[f][i]);
					break;
				}
			}
		}
		if (! good_pred)
			return false;
	}

//	cout << "#sel_frags: " << sel_frags.size() << endl;
	

	// calc specific peak vectors and collect data
	vector< vector< string> > feature_names;
	vector< vector< float > > feature_values;
	vector< vector< float > > feature_scores;
	vector< float > total_scores;


	feature_names.resize(num_peaks);
	feature_values.resize(num_peaks);
	feature_scores.resize(num_peaks);
	total_scores.resize(num_peaks,0);
	

	const Peptide& pep = sol.pep;
	const mass_t pm_with_19 = sol.pm_with_19;
	const int spec_charge = sol.charge;
	const int mobility = get_proton_mobility(pep,spec_charge);
	const int size_idx =  get_size_group(spec_charge,pm_with_19);
	
	if (! partition_models[spec_charge][size_idx][mobility])
	{
		cout << "Error: no rank partition model for " <<
			spec_charge << " " << size_idx << " " << mobility << endl;
		exit(1);
	}

	if (size_idx != 1 || mobility != 1)
		return false;

	const mass_t min_detected_mass = calc_min_detected_mass(pm_with_19, spec_charge);
	const mass_t max_detected_mass = get_max_detected_mass();


	const vector<int>& amino_acids = pep.get_amino_acids();
	vector<mass_t> exp_cuts;

	pep.calc_expected_breakage_masses(config,exp_cuts);

	const mass_t n_mass = pep.get_n_gap();

	// calculate a single set of ranks across the combined set of fragments
	const int start_cut_idx = (sol.reaches_n_terminal ? 1 : 0);
	const int last_cut_idx  = (sol.reaches_c_terminal ? exp_cuts.size()-1 : exp_cuts.size());
	const mass_t c_mass = exp_cuts[exp_cuts.size()-1];


	int max_l=0;
	for (i=0; i<sel_frags.size(); i++)
	{
		const int frag_idx=sel_frags[i];
		const int cut_idx = sel_idxs[i];
	
		const FragmentType& fragment = config->get_fragment(frag_idx);

		const mass_t cut_mass = exp_cuts[cut_idx];
		const mass_t peak_mass = fragment.calc_expected_mass(cut_mass,pm_with_19);
		
		RankBoostSample rbs;

		for (f=0; f<num_frags; f++)
			if (ppp.frag_idxs[f] == frag_idx)
				break;

	//	cout << "Frag: " << fragment.label << " fi:" << frag_idx << " f:" << f << endl;

		if (f==num_frags)
		{
			cout << "Error: bad frag!!!!" << endl;
			exit(1);
		}
		

		partition_models[spec_charge][size_idx][mobility]->fill_combined_simple_peak_features(
			this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
				
//		partition_models[spec_charge][size_idx][mobility]->fill_combined_peak_features(	
//			this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
			
		total_scores[i] = partition_models[spec_charge][size_idx][mobility]->combined_frag_boost_model.calc_rank_score_with_details(
									rbs,feature_names[i],feature_values[i],feature_scores[i]);
							
			
		if (feature_names[i].size()>max_l)
			max_l = feature_names[i].size();
	}


	cout << "Size: " << size_idx << " Mobility: " << mobility << endl;


	// print results
	for (i=0; i<num_peaks; i++)
	{
		cout << config->get_fragment(sel_frags[i]).label << " " <<
			sel_idxs[i];
		
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	cout << setprecision(2) << fixed;
	for (i=0; i<num_peaks; i++)
	{
		cout << total_scores[i];
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	for (i=0; i<num_peaks; i++)
	{
		cout << intensities[i];
		if (i<num_peaks-1)
		{
			cout << " & ";
		}
		else
			cout << "\\\\" << endl;
	}

	for (i=0; i<max_l; i++)
	{
		int j;
		for (j=0; j<num_peaks; j++)
		{
			if (feature_names[j].size()<=i)
			{
				cout << "           &  ";  
			}
			else
			{
				cout << feature_names[j][i] << " " << feature_values[j][i] << " & ";
				if (feature_scores[j][i]>0)
				{
					cout << "+";
				}
				cout << feature_scores[j][i];
			}

			if (j<num_peaks-1)
			{
				cout << " & ";
			}
			else
				cout << "\\\\" << endl;
		}
	}



	return true;
}


⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -