// peakrank_combined.cpp
#include "PeakRankModel.h"
#include "auxfun.h"
extern const int num_RKH_combos;
extern const int num_RKH_pairs;
extern const string combos[];
extern const float RKH_pair_matrix[6][6];
/************************************************************************
A note about the different structure of functions/classes when the
combined features are used. Since each partition has different fragments,
which induce different fragment names, most of the model informaiton was
pushed into the partition level, rather than the PeakRankModel level, with
the other models that looked at each fragment independently.
*************************************************************************/
int PartitionModel::read_combined_partition_model(const string& path, Config *config,
int _charge, int _size_idx, int _mobility)
{
const int max_global_frag_idx = config->get_all_fragments().size();
charge = _charge;
size_idx = _size_idx;
mobility = _mobility;
max_frag_idx = -1;
fragment_type_idxs.clear();
ostringstream oss;
oss << path << "_" << charge << "_" << this->size_idx << "_" << mobility << "_model.txt";
ifstream boost_file(oss.str().c_str());
if (! boost_file.is_open() ||! boost_file.good())
{
cout << "Warnind: Couldn\'t find " << oss.str() << endl;
return 0;
}
// read feature type and frags
char buff[256];
boost_file.getline(buff,256);
this->feature_set_type = NEG_INF;
int num_frags = NEG_INF;
istringstream iss(buff);
iss >> feature_set_type >> num_frags;
if (this->feature_set_type <=2)
{
cout << "Error: read_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
int f;
for (f=0; f<num_frags; f++)
{
int frag=NEG_INF;
iss >> frag;
if (frag<0)
{
cout << "Error: bad frags in combined model!" << endl;
exit(1);
}
fragment_type_idxs.push_back(frag);
if (frag>max_frag_idx)
max_frag_idx = frag;
}
combined_frag_boost_model.read_rankboost_model(boost_file);
boost_file.close();
num_features_per_frag= (combined_frag_boost_model.get_num_real_features()-1) /
fragment_type_idxs.size();
return fragment_type_idxs.size();
}
int PartitionModel::write_combined_partition_model(const string& path)
{
if (this->feature_set_type <=2)
{
cout << "Error: read_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
ostringstream oss;
fstream boost_file(path.c_str(),ios::out);
if (! boost_file.is_open() || ! boost_file.good())
{
cout << "Error: couldn't open file for wiriting!" << endl;
exit(1);
}
boost_file << feature_set_type << " " << fragment_type_idxs.size();
int f;
for (f=0; f<fragment_type_idxs.size(); f++)
boost_file << " " << fragment_type_idxs[f];
boost_file << endl;
combined_frag_boost_model.write_rankboost_model(boost_file,true);
return 1;
}
/*****************************************************************************
Creates the phi domain for the rank boost training.
phi contains pairs of idx0,idx1 and weight.
idx0 and idx1 are peak sample indexes while weight is the relative intensity
between them. The weight is normalized according to the maximal intensity of a
peak from the same frag type in the peptide (e.g., the strongest y is given
intensity 1 while a missing y is given intensity 0. The weight of x,y is set to
intensity[y]-intensity[x] (it is assumed that for all pairs, y is stronger than x).
******************************************************************************/
void create_phi_list_from_combined_peak_samples(const vector<float>& peak_intens,
const vector<int> & peak_frag_types,
const vector<PeakStart>& peak_starts,
const int max_frag_idx,
vector<SamplePairWeight>& phi)
{
const weight_t min_norm_weight = 0.2;
int i;
phi.clear();
for (i=0; i<peak_starts.size(); i++)
{
const int start_idx = peak_starts[i].peak_start_idx;
const int num_peaks = peak_starts[i].num_peaks;
vector<float> max_intens_per_type;
max_intens_per_type.resize(max_frag_idx+1,0); //
vector<float> intens;
intens.resize(num_peaks,-1);
float max_inten = -1;
int j;
for (j=0; j<num_peaks; j++)
{
const int peak_idx = start_idx+j;
const float peak_inten = peak_intens[peak_idx];
const int peak_frag_type = peak_frag_types[peak_idx];
intens[j] = peak_inten;
if (peak_inten>max_inten)
max_inten=peak_inten;
if (peak_inten>max_intens_per_type[peak_frag_type])
max_intens_per_type[peak_frag_type]=peak_inten;
}
if (max_inten<=0)
{
continue;
}
const float one_over_max = 1.0/max_inten;
for (j=0; j<num_peaks; j++)
intens[j] *= one_over_max;
for (j=0; j<=max_frag_idx; j++)
max_intens_per_type[j]*= one_over_max;
// look at all pairs, only consider pairs for which inten[idx1]>inten[idx0]
for (j=0; j<num_peaks; j++)
{
int k;
for (k=0; k<num_peaks; k++)
if (intens[k]>intens[j])
{
// ignore a pair of peaks if the difference between them is less than
// 0.15 of the maximum intensity (there might be too many misordered
// pairs in that range
if ( (intens[j]/intens[k]) > 0.7)
continue;
weight_t pair_weight = 0.4 + 0.6*intens[k] - 0.4*(intens[j]/intens[k]);
if (pair_weight<min_norm_weight)
continue;
// don't let very weak peaks participate in a missing peak pair
if (intens[j]==0 && intens[k]<0.15)
continue;
phi.push_back(SamplePairWeight(start_idx+j,start_idx+k,pair_weight));
}
}
}
cout << "Created " << phi.size() << " pairs for " << peak_starts.size() << " peptide samples" << endl;
}
void PartitionModel::train_combined_partition_model(
const PeakRankModel *prank,
char *sample_file_path,
int _charge,
int _size_idx,
int _mobility,
int num_frags_to_choose,
char *report_dir,
int max_num_rounds,
char *test_set_file,
int test_peptide_length,
char *stop_signal_file,
weight_t max_weight_ratio)
{
const float min_ratio_detected = 0.15;
Config *config = prank->get_config();
const vector<FragmentType>& all_fragments = config->get_all_fragments();
const vector<mass_t>& aa2mass = config->get_aa2mass();
const int num_all_frags = all_fragments.size();
if (max_num_rounds<0)
max_num_rounds = 1000;
charge = _charge;
size_idx = _size_idx;
mobility = _mobility;
vector<TrainingPeptide> sample_tps, test_tps;
vector<int> frag_counts;
vector<int> frag_detected;
vector<int> length_counts;
if (this->feature_set_type <=2)
{
cout << "Error: train_combined_partition_model works only on frag_set 3 and up!" << endl;
exit(1);
}
read_training_peptides_from_file(sample_file_path,sample_tps);
cout << "Read " << sample_tps.size() << " training tps...";
int num_tps_to_add = 0; // for de novo models
if (prank->get_feature_set_type() == 4)
{
if (sample_tps.size()<15000)
num_tps_to_add = 1;
if (sample_tps.size()<10000)
num_tps_to_add = 2;
if (sample_tps.size()<5000)
num_tps_to_add = 3;
if (charge>=3 || size_idx>=3)
num_tps_to_add /= 2;
cout << "Adding at most " << num_tps_to_add << " per tp." << endl;
}
if (prank->get_feature_set_type() == 4)
{
cout << "Converting train: " << sample_tps.size() << " --> ";
convert_tps_to_partial_denovo(config,sample_tps,num_tps_to_add,false);
cout << sample_tps.size() << endl;
}
if (test_set_file)
{
read_training_peptides_from_file(test_set_file, test_tps);
cout << "Read " << test_tps.size() << " test tps...";
if (prank->get_feature_set_type() == 4)
{
cout << "Converting train: " << test_tps.size() << " --> ";
convert_tps_to_partial_denovo(config,test_tps,num_tps_to_add);
cout << test_tps.size() << endl;
}
}
// Create intial report on dataset
frag_counts.resize(num_all_frags,0);
frag_detected.resize(num_all_frags,0);
length_counts.resize(200,0);
int numH=0,numK=0, numR=0;
int i,f;
for (i=0; i<sample_tps.size(); i++)
{
const TrainingPeptide& tp = sample_tps[i];
int f;
for (f=0; f<tp.intens.size(); f++)
{
int j;
for (j=1; j<tp.intens[f].size(); j++)
{
if (tp.intens[f][j]>=0)
frag_counts[tp.frag_idxs[f]]++;
if (tp.intens[f][j]>0)
frag_detected[tp.frag_idxs[f]]++;
}
}
int j;
for (j=0; j<tp.amino_acids.size(); j++)
{
if (tp.amino_acids[j]==His)
numH++;
if (tp.amino_acids[j]==Lys)
numK++;
if (tp.amino_acids[j]==Arg)
numR++;
}
length_counts[tp.amino_acids.size()]++;
}
// report and select fragments for training
cout << "# training peptides: " << sample_tps.size() << endl;
cout << "Avg #R: " << numR/(double)sample_tps.size() << endl;
cout << "Avg #K: " << numK/(double)sample_tps.size() << endl;
cout << "Avg #H: " << numH/(double)sample_tps.size() << endl;
cout << endl << "Sample lengths:" << endl;
for (i=0; i<length_counts.size(); i++)
if (length_counts[i]>0)
cout << i << "\t" << length_counts[i] << endl;
cout << endl;
vector<float> frag_ratios;
frag_ratios.resize(all_fragments.size(),0);
cout << "Detected fragments: " << endl;
for (i=0; i<all_fragments.size(); i++)
{
const float ratio = (frag_counts[i]>0 ? (float)frag_detected[i]/frag_counts[i] : 0);
cout << all_fragments[i].label << "\t" << frag_detected[i] << " / " <<
frag_counts[i] << "\t = " << setprecision(3) << ratio << endl;
if (ratio>=min_ratio_detected)
frag_ratios[i]=ratio;
}
cout << endl;
// choose the most abundant fragments
cout << "Selected: ";
this->fragment_type_idxs.clear();
int max_frag_idx=-1;
for (f=0; f<num_frags_to_choose; f++)
{
float max_ratio=0;
int max_idx=-1;
int j;
for (j=0; j<frag_ratios.size(); j++)
if (frag_ratios[j]>max_ratio)
{
max_ratio=frag_ratios[j];
max_idx=j;
}
if (max_idx<0)
break;
fragment_type_idxs.push_back(max_idx);
frag_ratios[max_idx]=-1;
cout << " " << all_fragments[max_idx].label;
if (max_idx>max_frag_idx)
max_frag_idx=max_idx;
}
cout << endl;
if (fragment_type_idxs.size() == 0)
{
cout << "No fragments to train!" << endl;
return;
}
if (prank->get_feature_set_type() == 3)
{
set_combined_feature_names_in_rankboost_model(prank);
}
else if (prank->get_feature_set_type() == 4)
{
set_combined_dnv_feature_names_in_rankboost_model(prank);
}
else if (prank->get_feature_set_type() == 5)
{
// NOTE(review): the remainder of this file -- the body of the
// feature_set_type == 5 branch and everything after it in
// train_combined_partition_model -- was lost during extraction and
// replaced by website UI text. Recover it from the original source;
// the function above is syntactically incomplete as-is.