// partitionmodel.cpp
#include "PeakRankModel.h"
/*****************************************************************************
Creates the phi domain for the rank boost training.
phi contains pairs of idx0,idx1 and weight.
idx0 and idx1 are peak sample indexes while weight is the relative intensity
between them. The weight is normalized according to the maximal intensity of a
peak from the same frag type in the peptide (e.g., the strongest y is given
intensity 1 while a missing y is given intensity 0. The weight of x,y is set to
intensity[y]-intensity[x] (it is assumed that for all pairs, y is stronger than x).
******************************************************************************/
/*****************************************************************************
Builds the phi domain (training pairs) for rank-boost training.
Each phi entry holds a pair of peak sample indexes (idx0,idx1) and a weight,
where the peak at idx1 is the stronger of the two.  Intensities are first
normalized by the strongest peak of the same fragment type in the peptide
(strongest peak -> 1, missing peak -> 0).  The pair weight mixes the intensity
gap between the two peaks with the strong peak's intensity relative to the
maximal annotated intensity in the spectrum.
******************************************************************************/
void create_phi_list_from_samples(const vector<float>& peak_intens,
				  const vector<PeakStart>& peak_starts,
				  const vector<float>& max_annotated_intens,
				  vector<SamplePairWeight>& phi)
{
	const weight_t min_norm_weight = 0.1;

	phi.clear();
	for (int s=0; s<peak_starts.size(); s++)
	{
		const int first_idx  = peak_starts[s].peak_start_idx;
		const int peak_count = peak_starts[s].num_peaks;

		// copy this sample's intensities and locate the strongest peak
		vector<float> norm_intens;
		norm_intens.resize(peak_count,-1);
		float strongest = -1;
		for (int p=0; p<peak_count; p++)
		{
			norm_intens[p]=peak_intens[first_idx+p];
			if (norm_intens[p]>strongest)
				strongest=norm_intens[p];
		}

		// samples without a positive maximal intensity contribute no pairs
		if (strongest<=0)
			continue;

		// normalize so the strongest peak has intensity 1
		const float one_over_max = 1.0/strongest;
		for (int p=0; p<peak_count; p++)
			norm_intens[p] *= one_over_max;

		const float max_annotated_inten = max_annotated_intens[s]*one_over_max;

		// examine all ordered pairs; keep only those where peak b is stronger than a
		for (int a=0; a<peak_count; a++)
			for (int b=0; b<peak_count; b++)
			{
				if (! (norm_intens[b]>norm_intens[a]))
					continue;

				// drop pairs whose intensities are within 15% of each other,
				// since there might be too many misordered pairs in that range
				if ( (norm_intens[a]/norm_intens[b]) > 0.85)
					continue;

				// pair weight: one part reflects the intensity gap between the
				// two peaks, the other the strong peak's intensity relative to
				// all annotated peaks in the spectrum
				weight_t pair_weight = 0.6*(norm_intens[b]-norm_intens[a]) + 0.4 *norm_intens[b]/max_annotated_inten;
				if (pair_weight>1.0)
					pair_weight = 1.0;
				if (pair_weight<min_norm_weight)
					continue;

				// very weak peaks should not participate in a missing-peak pair
				if (norm_intens[a]==0 && pair_weight<0.25)
					continue;

				phi.push_back(SamplePairWeight(first_idx+a,first_idx+b,pair_weight));
			}
	}
	cout << "Created " << phi.size() << " pairs for " << peak_starts.size() << " peptide samples" << endl;
}
/******************************************************************************
The main function at the partition model level for calculating the peaks' rank
scores. Returns results in the PeptidePeakPrediction structure, in there there
is a 2D table of score (rows are the fragment type, columns are the cut idxs).
number of rows equals the number of fragment idxs in the ppp
*******************************************************************************/
// Fills ppp.rank_scores with a rank score for every (fragment type, cut idx)
// of the peptide in 'sol'.  Two regimes, selected by the model's own
// feature_set_type member:
//   >= 3 : one combined boost model scores all fragment types
//          (rows indexed by the local fragment position f);
//   <= 2 : a separate boost model per fragment type
//          (rows indexed by the global frag_idx).
// Peaks whose expected mass falls outside [min_detected_mass,
// max_detected_mass] are skipped and keep the NEG_INF fill value.
// ptr_frag_type_idxs (optional) restricts the per-fragment regime to the
// listed global fragment indices.
void PartitionModel::calc_peptides_peaks_rank_scores(
const PeakRankModel *prank,
const PeptideSolution& sol,
mass_t min_detected_mass,
mass_t max_detected_mass,
PeptidePeakPrediction& ppp,
int feature_set_type,
const vector<int>* ptr_frag_type_idxs) const
{
Config *config = prank->get_config();
const int num_frags = fragment_type_idxs.size();
const mass_t mass_with_19 = sol.pm_with_19;
const Peptide& pep = sol.pep;
const vector<int>& amino_acids = pep.get_amino_acids();
// expected breakage (cut) masses for every cut position of the peptide
vector<mass_t> exp_cuts;
pep.calc_expected_breakage_masses(config,exp_cuts);
ppp.amino_acids = pep.get_amino_acids();
ppp.num_frags = num_frags;
ppp.frag_idxs = fragment_type_idxs;
// NOTE(review): rank_scores is sized by the number of *local* fragment
// types, but the feature_set_type<=2 branch below indexes it with the
// *global* frag_idx — confirm those indices are always within range.
ppp.rank_scores.resize(fragment_type_idxs.size());
int f;
// NOTE(review): this n_mass is shadowed by the one declared inside the
// combined-feature branch below and appears otherwise unused here.
const mass_t n_mass = pep.get_n_gap();
// calculate a single set of ranks across the combined set of fragments
if (this->feature_set_type >= 3)
{
// terminal cut positions are scored only if the solution reaches that terminal
const int start_cut_idx = (sol.reaches_n_terminal ? 1 : 0);
const int last_cut_idx = (sol.reaches_c_terminal ? exp_cuts.size()-1 : exp_cuts.size());
const mass_t n_mass = exp_cuts[0];
const mass_t c_mass = exp_cuts[exp_cuts.size()-1];
ppp.combined_peak_scores.clear();
ppp.frag_idxs = fragment_type_idxs;
for (f=0; f<num_frags; f++)
{
const int frag_idx = fragment_type_idxs[f];
const FragmentType& fragment = config->get_fragment(frag_idx);
ppp.rank_scores[f].resize(exp_cuts.size(),NEG_INF);
int cut_idx;
for (cut_idx=start_cut_idx; cut_idx<last_cut_idx; cut_idx++)
{
const mass_t cut_mass = exp_cuts[cut_idx];
const mass_t peak_mass = fragment.calc_expected_mass(cut_mass,mass_with_19);
// skip peaks that fall outside the detectable mass window
if (peak_mass<min_detected_mass || peak_mass>max_detected_mass)
continue;
RankBoostSample rbs;
// fill the feature vector according to the requested feature set
if (feature_set_type == 3)
{
fill_combined_peak_features(prank, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
}
else if (feature_set_type == 4)
{
fill_combined_dnv_peak_features(prank, n_mass, c_mass, amino_acids, cut_idx,
cut_mass, sol, fragment, f, rbs);
}
else if (feature_set_type == 5)
{
fill_combined_simple_peak_features(prank, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
}
else
{
cout << "Error: feature set type not supported: " << feature_set_type << endl;
exit(1);
}
// one shared model scores all fragment types in this regime
ppp.rank_scores[f][cut_idx]= this->combined_frag_boost_model.calc_rank_score(rbs);
}
}
return;
}
// calculate separate sets of ranks for each fragment type
if (this->feature_set_type<=2)
{
for (f=0; f<num_frags; f++)
{
const int frag_idx = fragment_type_idxs[f];
// only compute results for frags in the list
if (ptr_frag_type_idxs)
{
int i;
for (i=0; i<(*ptr_frag_type_idxs).size(); i++)
if ( (*ptr_frag_type_idxs)[i] == frag_idx)
break;
if (i==(*ptr_frag_type_idxs).size())
continue;
}
// NOTE(review): indexed by global frag_idx here, while the combined
// branch above uses the local index f — verify callers read it this way.
ppp.rank_scores[frag_idx].resize(exp_cuts.size(),NEG_INF);
const FragmentType& fragment = config->get_fragment(frag_idx);
int cut_idx;
// interior cut positions only in this regime (terminal cuts excluded)
for (cut_idx=1; cut_idx<exp_cuts.size()-1; cut_idx++)
{
const mass_t cut_mass = exp_cuts[cut_idx];
const mass_t peak_mass = fragment.calc_expected_mass(cut_mass,mass_with_19);
if (peak_mass<min_detected_mass || peak_mass>max_detected_mass)
continue;
RankBoostSample rbs;
if (feature_set_type == 0)
{
prank->fill_simple_peak_features(amino_acids, cut_idx,
cut_mass, mass_with_19, charge, fragment, rbs);
}
else if (feature_set_type == 1)
{
prank->fill_advanced_peak_features(amino_acids, cut_idx,
cut_mass, mass_with_19, charge, fragment, rbs);
}
else if (feature_set_type == 2)
{
prank->fill_partial_denovo_peak_features(pep.get_n_gap(), exp_cuts[exp_cuts.size()-1],
amino_acids,cut_idx, cut_mass,
mass_with_19, charge, fragment, ppp.most_basic_missing_on_n,
ppp.most_basic_missing_on_c , rbs);
}
else
{
cout << "Error: feature set type not supported: " << feature_set_type << endl;
exit(1);
}
// each fragment type has its own model (frag_models is parallel to fragment_type_idxs)
ppp.rank_scores[frag_idx][cut_idx]=frag_models[f].calc_rank_score(rbs);
}
}
}
}
// Loads the partition model for the given (charge, size, mobility) bin.
// For feature sets <= 2 it probes one boost-model file per global fragment
// index (path_<charge>_<size>_<mobility>_<frag>.txt) and loads every file
// found, returning the number of fragment models read.  For combined
// feature sets (> 2) it delegates to read_combined_partition_model() and
// returns 1 on success, 0 on failure.
int PartitionModel::read_partition_model(const string& path, Config *config,
int _charge, int _size_idx, int _mobility)
{
	const int max_global_frag_idx = config->get_all_fragments().size();

	charge   = _charge;
	size_idx = _size_idx;
	mobility = _mobility;
	max_frag_idx = -1;
	fragment_type_idxs.clear();
	frag_models.clear();

	// combined feature sets store a single model through a different path
	if (this->feature_set_type > 2)
		return (read_combined_partition_model(path,config,_charge,_size_idx,_mobility)>0) ? 1 : 0;

	for (int f=0; f<max_global_frag_idx; f++)
	{
		ostringstream oss;
		oss << path << "_" << charge << "_" << this->size_idx << "_" << mobility <<
			"_" << f << ".txt";

		// a missing file simply means no model was trained for this fragment
		ifstream boost_file(oss.str().c_str());
		if (! boost_file.is_open() || ! boost_file.good())
			continue;

		fragment_type_idxs.push_back(f);
		max_frag_idx = f;

		// push an empty model first, then read directly into the stored copy
		RankBoostModel rbm;
		frag_models.push_back(rbm);
		frag_models.back().read_rankboost_model(boost_file);
		boost_file.close();
	}
	return fragment_type_idxs.size();
}
/*********************************************************************
**********************************************************************/
/*********************************************************************
Writes each initialized per-fragment rank-boost model to its own file
(path_<charge>_<size_idx>_<mobility>_<frag_idx>.txt) and returns the
number of models written.  For combined feature sets (> 2) the work is
delegated to write_combined_partition_model() and its return value is
passed through.  Exits the program if a file cannot be opened.
**********************************************************************/
int PartitionModel::write_partition_model(const string& path)
{
	// combined feature sets store a single model through a different path
	// (early return also removes the unreachable trailing return the
	// original had after this if/else)
	if (this->feature_set_type > 2)
		return write_combined_partition_model(path);

	int num_written=0;
	int f;
	for (f=0; f<this->fragment_type_idxs.size(); f++)
	{
		const int frag_idx = fragment_type_idxs[f];

		// skip models that were never trained/initialized
		if (! frag_models[f].get_ind_was_initialized())
			continue;

		ostringstream oss;
		oss << path << "_" << charge << "_" << this->size_idx << "_" << mobility <<
			"_" << frag_idx << ".txt";
		cout << "Writing: " << oss.str() << endl;

		fstream boost_file(oss.str().c_str(),ios::out);
		if (! boost_file.is_open() || ! boost_file.good())
		{
			// fixed typo in the original message ("wiriting")
			cout << "Error: couldn't open file for writing!" << endl;
			exit(1);
		}

		frag_models[f].write_rankboost_model(boost_file,true);
		boost_file.close();
		num_written++;
	}
	return num_written;
}
/*****************************************************************************
Takes as an input a training file of peptide samples.
For each fragment type with sufficient count, it creates a dataset and
trains the rank model.
******************************************************************************/
void PartitionModel::train_partition_model(
PeakRankModel *prank,
char *sample_file_path,
int _charge,
int _size_idx,
int _mobility,
int frag_idx_to_train,
char *report_dir,
int max_num_rounds,
char *test_set_file,
int test_peptide_length,
char *stop_signal_file,
weight_t max_weight_ratio)
{
const float min_ratio_detected = 0.15;
Config *config = prank->get_config();
const vector<FragmentType>& all_fragments = config->get_all_fragments();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const int num_frags = all_fragments.size();
if (max_num_rounds<0)
max_num_rounds = 1000;
charge = _charge;
size_idx = _size_idx;
mobility = _mobility;
vector<TrainingPeptide> sample_tps, test_tps;
vector<int> frag_counts;
vector<int> frag_detected;
vector<int> length_counts;
read_training_peptides_from_file(sample_file_path,sample_tps);
cout << "Read " << sample_tps.size() << " training tps...";
int num_tps_to_add = 0;
if (prank->get_feature_set_type() == 2)
{
if (sample_tps.size()<25000)
num_tps_to_add = 1;
if (sample_tps.size()<15000)
num_tps_to_add = 2;
if (sample_tps.size()<10000)
num_tps_to_add = 3;
if (sample_tps.size()<5000)
num_tps_to_add = 4;
cout << "Adding at most " << num_tps_to_add << " per tp." << endl;
}
if (prank->get_feature_set_type() == 2)
convert_tps_to_partial_denovo(config,sample_tps,num_tps_to_add);
if (test_set_file)
{
read_training_peptides_from_file(test_set_file, test_tps);
cout << "Read " << test_tps.size() << " test tps...";
if (prank->get_feature_set_type() == 2)
convert_tps_to_partial_denovo(config,test_tps,num_tps_to_add);
}
// Create intial report on dataset
frag_counts.resize(num_frags,0);
frag_detected.resize(num_frags,0);
length_counts.resize(200,0);
int numH=0,numK=0, numR=0;
int i;
for (i=0; i<sample_tps.size(); i++)
{
const TrainingPeptide& tp = sample_tps[i];
int f;
for (f=0; f<tp.intens.size(); f++)
{
int j;
for (j=1; j<tp.intens[f].size(); j++)
{
if (tp.intens[f][j]>=0)
frag_counts[tp.frag_idxs[f]]++;
if (tp.intens[f][j]>0)
frag_detected[tp.frag_idxs[f]]++;
}
}
int j;
for (j=0; j<tp.amino_acids.size(); j++)
{
if (tp.amino_acids[j]==His)
numH++;
if (tp.amino_acids[j]==Lys)
numK++;
if (tp.amino_acids[j]==Arg)
numR++;
}
length_counts[tp.amino_acids.size()]++;
}
// report and select fragments for training
cout << "# training peptides: " << sample_tps.size() << endl;
cout << "Avg #R: " << numR/(double)sample_tps.size() << endl;
cout << "Avg #K: " << numK/(double)sample_tps.size() << endl;
cout << "Avg #H: " << numH/(double)sample_tps.size() << endl;
cout << endl << "Sample lengths:" << endl;
for (i=0; i<length_counts.size(); i++)
if (length_counts[i]>0)
cout << i << "\t" << length_counts[i] << endl;
cout << endl;
for (i=0; i<all_fragments.size(); i++)
{
float ratio = (float)frag_detected[i]/frag_counts[i];
cout << all_fragments[i].label << "\t" << frag_detected[i] << " / " <<
frag_counts[i] << "\t = " << setprecision(3) << ratio << endl;
if (ratio>=min_ratio_detected)
{
if (frag_idx_to_train<0 || frag_idx_to_train == i)
fragment_type_idxs.push_back(i);
}
}
cout << endl;
cout << "Max weight ratio: " << max_weight_ratio << endl;
if (fragment_type_idxs.size() == 0)
{
cout << "No models to train!" << endl;
return;
}
// Train each selected model
frag_models.resize(fragment_type_idxs.size());
int f;
for (f=0; f<fragment_type_idxs.size(); f++)
{
const int frag_idx = fragment_type_idxs[f];
if (frag_idx>0 && frag_idx != frag_idx_to_train)
continue;
const int frag_charge = all_fragments[frag_idx].charge;
cout << "Training frag " << frag_idx << " (" << config->get_fragment(frag_idx).label <<")" << endl;
// fill RankBoostSamples and create rank ds
RankBoostDataset rank_ds,test_ds;
vector<float> peak_intens;
vector<PeakStart> peak_starts;
vector<float> max_annotated_intens;
// Train RankBoostModel
cout << "TRAINING..." << endl;
// initialize and read the test set if it exists
RankBoostDataset *test_set_ptr=NULL;
if (test_set_file)
{
vector<float> test_peak_intens;
vector<PeakStart> test_peak_starts;
vector<float> test_max_annotated_intens;
cout << "Reading test tps..." << endl;
prank->read_training_peptides_into_rank_boost_dataset(frag_idx, charge,
test_tps, test_ds, test_peak_intens, test_peak_starts,
test_max_annotated_intens);
cout << "Creating test phi list..." << endl;
create_phi_list_from_samples(test_peak_intens, test_peak_starts,
test_max_annotated_intens, test_ds.get_non_const_phi_support());
test_ds.compute_total_phi_weight();
test_set_ptr = &test_ds;
// choose length (try shorte peptide if not eonough samples, go for the max)
// NOTE: source truncated here — the remainder of train_partition_model is missing from this copy.