rankboost.cpp

#include "RankBoost.h"
#include "auxfun.h"


bool RankBoostModel::read_rankboost_model(istream& is)
{
	num_binary_features = 0;
	num_real_features =0;
	int num_non_zero_real_features=0;

	is >> num_binary_features;
	if (num_binary_features>0)
	{
		binary_feature_names.resize(num_binary_features);
		binary_weights.resize(num_binary_features,0);
		ind_active_binary_feature.resize(num_binary_features,true);

		int i;
		for (i=0; i<num_binary_features; i++)
		{
			int idx=-1;
			is >> idx >> binary_weights[i] >> binary_feature_names[i];
			if (idx<0 || binary_weights[i] == 0)
			{
				cout << "Error: reading binary weight line " << i << endl;
				exit(1);
			}
		}
	}

	is >> num_real_features;
	is >> num_non_zero_real_features;
	if (num_real_features>0)
	{
		real_feature_names.resize(num_real_features);
		real_default_weights.resize(num_real_features,0);
		real_weights.resize(num_real_features);
		real_limits.resize(num_real_features);
		ind_active_real_feature.resize(num_real_features,false);
		non_zero_real_idxs.clear();
	
		int i;
		for (i=0; i<num_non_zero_real_features; i++)
		{
			int num_weights=0;
			int num_limits=0;
			int j;

			char buff[1024];

			is.getline(buff,1024);
			is.getline(buff,1024);
			int feature_idx;
			double weight;
			if (sscanf(buff,"%d\t%lf",&feature_idx,&weight) != 2)
			{
				cout << "Error: bad line in model file:" << endl << buff << endl;
				exit(1);
			}

			real_default_weights[feature_idx]=weight;
			
			const int len = strlen(buff);
			int tab_count=0;
			for (j=0; j<len && tab_count<2; j++)
				if (buff[j]=='\t')
					tab_count++;

			if (tab_count<2)
			{
				cout << "Error: bad tab count!" << endl;
				exit(1);
			}

			// j now points just past the second tab, so buff+j is the feature name
			real_feature_names[feature_idx] = string(buff+j);

			ind_active_real_feature[feature_idx] = true;
			non_zero_real_idxs.push_back(feature_idx);

			string &fn = real_feature_names[feature_idx];

			is >> num_limits;
			real_limits[feature_idx].resize(num_limits,NEG_INF);

			for (j=0; j<num_limits; j++)
				is >> real_limits[feature_idx][j];

			is >> num_weights;
			real_weights[feature_idx].resize(num_weights,0);
			for (j=0; j<num_weights; j++)
				is >> real_weights[feature_idx][j];
		}

		// this last part is done to move to the next line (so the next getline gets a new line)
		char buff[16];
		is.getline(buff,10);
		if (strlen(buff)>1)
		{
			cout << "Error: bad parsing of RankBoostModel!" << endl;
			exit(1);
		}
	}

	sort(non_zero_real_idxs.begin(),non_zero_real_idxs.end());

	this->total_default_weight=0;
	int i;
	for (i=0; i<num_real_features; i++)
		total_default_weight+=real_default_weights[i];

	ind_was_initialized=true;

	return true;
}


void RankBoostModel::write_rankboost_model(ostream& os, bool ind_compress)
{
	// compresses the weights and limits before writing
	if (ind_compress)
		compress_real_limits_and_weights();

	if (! ind_was_initialized)
	{
		cout << "Error: trying to write an unitialized RankBoost model!" << endl;
		exit(1);
	}

	int i;
	int num_non_zero_bin = 0;
	for (i=0; i<num_binary_features; i++)
		if (binary_weights[i] != 0)
			num_non_zero_bin++;

	os << num_non_zero_bin << endl;
	os << setprecision(8);

	for (i=0; i<num_binary_features; i++)
		if (binary_weights[i] != 0)
			os << i << "\t" << binary_weights[i] << "\t" << binary_feature_names[i] << endl;

	// assumes the limits and weights were already compressed, otherwise the model
	// will be very large
	int num_non_zero_real = 0;
	for (i=0; i<num_real_features; i++)
		if (real_weights[i].size()>0)
		{
			num_non_zero_real++;
		}

	os << num_real_features << endl;
	os << num_non_zero_real << endl;
	for (i=0; i<num_real_features; i++)
	{
		if ( real_weights[i].size() == 0 )
			continue;

		os << i << "\t" << real_default_weights[i] << "\t" << real_feature_names[i] << endl;
		os << real_limits[i].size();
		int j;
		for (j=0; j<real_limits[i].size(); j++)
			os << "\t" << real_limits[i][j];
		os << endl;
		os << real_weights[i].size();
		for (j=0; j<real_weights[i].size(); j++)
			os << "\t" << real_weights[i][j];
		os << endl;
	}
}
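
/*
   Usage sketch (hypothetical caller, not part of this file; needs <fstream>):
   a model written with write_rankboost_model() can be parsed back with
   read_rankboost_model(), e.g.

       RankBoostModel model;
       // ... fill / train the model ...
       ofstream ofs("rankboost_model.txt");     // file name is made up
       model.write_rankboost_model(ofs, true);  // compress before writing
       ofs.close();

       RankBoostModel copy;
       ifstream ifs("rankboost_model.txt");
       copy.read_rankboost_model(ifs);
*/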


/***********************************************************************
// Compresses the limits and weights so the weight function is stored as
// steps, without repeating the same weight across consecutive limits
************************************************************************/
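// Example (assuming the step-function convention implied by the code below:
// the first weight applies below the first limit, weight j between limits
// j-1 and j, and the last weight above the last limit):
//   limits [0.5, 1.0, 1.5], weights [0.0, 0.2, 0.2, 0.7]
// compress to
//   limits [0.5, 1.5], weights [0.0, 0.2, 0.7]
// because the weight does not change at the limit 1.0.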
void RankBoostModel::compress_real_limits_and_weights()
{
	int i;

	for (i=0; i<num_real_features; i++)
	{
		if (real_limits[i].size() < 1)
		{
			real_weights[i].clear();
			real_limits[i].clear();
			continue;
		}
		else
		{
			int j;
			if (real_default_weights[i]==0)
			{
				for (j=0; j<real_weights[i].size(); j++)
					if (real_weights[i][j] != 0)
						break;
				if (j== real_weights[i].size())
				{
					real_weights[i].clear();
					real_limits[i].clear();
					continue;
				}
			}
		}

		vector<float> new_limits;
		vector<weight_t> new_weights;

		new_weights.push_back(real_weights[i][0]);
		new_limits.push_back(real_limits[i][0]);

		int j;
		for (j=1; j<real_limits[i].size(); j++)
		{
			if (real_weights[i][j] != real_weights[i][j+1])
			{
				new_weights.push_back(real_weights[i][j]);
				new_limits.push_back(real_limits[i][j]);
			}
		}
		new_weights.push_back(real_weights[i][j]); // last weight is for values larger than the last limit

		real_limits[i]=new_limits;
		real_weights[i]=new_weights;
	}

	remove_default_weights();
}

/*************************************************************************
Removes the default weights by subtracting them from all the other weights.
Also moves the min/max values towards 0 if they all lie above or below it.
**************************************************************************/
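// Example: a feature with default weight 1.2 and step weights [1.2, 1.5, 1.7]
// becomes default weight 0 with step weights [0.0, 0.3, 0.5].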
void RankBoostModel::remove_default_weights()
{
	int i;
	for (i=0; i<num_real_features; i++)
	{
		if (this->real_default_weights[i] != 0)
		{
			int j;
			for (j=0 ;j<real_weights[i].size(); j++)
				real_weights[i][j] -= real_default_weights[i];
			real_default_weights[i]=0;
		}
	}
}

/*************************************************************************
Calculates the weighted ranking error on the training set: the summed phi
weight of the pairs (idx1, idx2) that are scored in the wrong order
(score(idx1) >= score(idx2)), divided by the dataset's total phi weight.
**************************************************************************/
double RankBoostModel::calc_training_rank_error(const RankBoostDataset& training_ds) const
{
	const int num_samples = training_ds.get_num_samples();
	const vector<RankBoostSample>& samples = training_ds.get_samples();

	vector<weight_t> rank_scores;

	rank_scores.resize(num_samples);

	int i;
	for (i=0; i<num_samples; i++)
		rank_scores[i]=calc_rank_score(samples[i]);

	const vector<SamplePairWeight>& phi_support = training_ds.get_phi_support();
	double train_error=0;
	for (i=0; i<phi_support.size(); i++)
	{
		if (rank_scores[phi_support[i].idx1]>=rank_scores[phi_support[i].idx2])
			train_error+=phi_support[i].weight;
	}

	return train_error/training_ds.get_total_phi_weight();
}


struct rank_score_pair {
	rank_score_pair() : rank(999), score(NEG_INF) {};
	rank_score_pair(int _r, float _s) : rank(_r), score(_s) {};

	bool operator< (const rank_score_pair& other) const
	{
		return (score>other.score);
	}
	int rank;
	float score;
};
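// Note: operator< orders by descending score, so sorting a group's
// rank_score_pair vector puts its highest-scoring peak first; the position
// after sorting is used below as the predicted rank.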

/***************************************************************************
If tag3_filter_val >= 0, it is used to filter out samples whose tag3 value
differs from it.
Calculates the weighted ranking error for a given dataset.
Also calculates the error in peak ranking for all peptides:
avg predicted rank and sd of the predictions for the strongest peak,
avg predicted rank for the 2nd strongest peak, etc.
****************************************************************************/
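// The returned weighted error is the summed phi weight of the pairs
// (idx1, idx2) with score(idx1) >= score(idx2), divided by the summed weight
// of the pairs in which both samples were scored (i.e., passed the tag3 filter).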
double RankBoostModel::calc_prediction_error(const RankBoostDataset& rank_ds, 
								 vector<peak_rank_stat>& peak_stats,
								 int tag3_filter_val,
								 int *ptr_num_groups_tested) const
{
	const int num_samples = rank_ds.get_num_samples();
	const vector<RankBoostSample>& samples = rank_ds.get_samples();
	const int num_groups = rank_ds.get_num_groups();

	
	vector<weight_t> rank_scores;

	rank_scores.resize(num_samples,NEG_INF);

	if (ptr_num_groups_tested)
	{
		vector<bool> group_inds;
		group_inds.resize(num_groups,false);
		int i;
		for (i=0; i<num_samples; i++)
			if (tag3_filter_val<0 || samples[i].tag3 == tag3_filter_val)
			{
				rank_scores[i]=calc_rank_score(samples[i]);
				group_inds[samples[i].group_idx]=true;
			}
		*ptr_num_groups_tested=0;
		for (i=0; i<group_inds.size(); i++)
			if (group_inds[i])
				(*ptr_num_groups_tested)++; 
	}
	else
	{
		int i;
		for (i=0; i<num_samples; i++)
			if (tag3_filter_val<0 || samples[i].tag3 == tag3_filter_val)
				rank_scores[i]=calc_rank_score(samples[i]);
	}

	int i;
	const vector<SamplePairWeight>& phi_support = rank_ds.get_phi_support();
	double error=0;
	double phi_weight=0;
	for (i=0; i<phi_support.size(); i++)
	{
		const int idx1 = phi_support[i].idx1;
		const int idx2 = phi_support[i].idx2;

		if (rank_scores[idx1]<=NEG_INF || rank_scores[idx2]<=NEG_INF)
			continue;
		
		if (rank_scores[idx1]>=rank_scores[idx2])
			error+=phi_support[i].weight;

		phi_weight += phi_support[i].weight;
	}
	double weighted_error = error/phi_weight;

//	cout << ">>>" << tag3_filter_val << "\t" << error << "\t" << phi_weight << endl;

	vector< vector<rank_score_pair> > peptide_peak_ranks;
	peptide_peak_ranks.clear();
	peptide_peak_ranks.resize(num_groups);

	vector< vector<int> > peak_predictions; // [original rank][predicted rank] counts
	peak_predictions.resize(100); // up to 100 strong peaks...
	for (i=0; i<100; i++)
		peak_predictions[i].resize(100,0);

	for (i=0; i<num_samples; i++)
	{
		if (rank_scores[i]<=NEG_INF)
			continue;

		const RankBoostSample& sam = samples[i];
		peptide_peak_ranks[sam.group_idx].push_back(rank_score_pair(sam.rank_in_group,rank_scores[i]));
	}

	for (i=0; i<peptide_peak_ranks.size(); i++)
	{
		if (peptide_peak_ranks[i].size() == 0)
			continue;

		sort(peptide_peak_ranks[i].begin(),peptide_peak_ranks[i].end());

		int j;
		for (j=0; j<peptide_peak_ranks[i].size(); j++)
		{
			const int org_rank = peptide_peak_ranks[i][j].rank;
			const int predicted_rank = (j > 99 ? 99 : j);

			if (org_rank<100)
				peak_predictions[org_rank][predicted_rank]++;
		}
	}


	peak_stats.clear();

	// for each true rank with at least 5 observed peaks, record the mean and sd
	// of its predicted ranks and the fraction of peaks predicted exactly correctly
	for (i=0; i<100; i++)
	{
		vector<int> vals,counts;
		int peak_count=0;
		int j;
		for (j=0; j<peak_predictions[i].size(); j++)
			peak_count += peak_predictions[i][j];

		if (peak_count < 5)
			break;

		double mean=0,sd=-1; 
		for (j=0; j<peak_predictions[i].size(); j++)
		{
			if (peak_predictions[i][j]>0)
			{
				vals.push_back(j);
				counts.push_back(peak_predictions[i][j]);
			}
		}
		calc_mean_sd_from_counts(vals,counts,&mean,&sd);

		peak_rank_stat prs;

		prs.true_rank=i;
		prs.avg_precited_rank = mean;
		prs.sd_predicted_rank = sd;
		prs.precent_predicted_correctly = peak_predictions[i][i]/(float)peak_count;
		
		peak_stats.push_back(prs);
	}
	
	return weighted_error;
}

/*************************************************************************
Calculates a sample's rank score. The score accumulates the weights of the
sample's non-zero binary features plus the total default ("no vote") weight
of the real features.
**************************************************************************/
weight_t RankBoostModel::calc_rank_score(const RankBoostSample& sample) const
{
	const vector<int>& binary_idxs = sample.binary_non_zero_idxs;
	weight_t score=0;

	if (! ind_was_initialized)
	{
		cout << "Error: RankBoostModel not initialized!" << endl;
		exit(1);
	}

	int i;
	for (i=0; i<binary_idxs.size(); i++)
		score += binary_weights[binary_idxs[i]];

	// first add all no vote scores
	score += total_default_weight;
