📄 rankboost.h

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 H
字号:
#ifndef __RANKBOOST_H__
#define __RANKBOOST_H__

#include "includes.h"


struct idx_weight_pair {
	idx_weight_pair() : idx(-1), weight(0) {};
	idx_weight_pair(int _i, weight_t _w) : idx(_i), weight(_w) {};

	bool operator< (const idx_weight_pair& other) const
	{
		return weight>other.weight;
	}

	int idx;
	weight_t weight;
};


struct peak_rank_stat {
	peak_rank_stat() : true_rank(-1), avg_precited_rank(-1), sd_predicted_rank(-1), precent_predicted_correctly(-1) {};

	int true_rank;
	float avg_precited_rank;
	float sd_predicted_rank;
	float precent_predicted_correctly;
};

class RankBoostModel;

struct RankBoostSample {

	RankBoostSample() { clear(); }

	void clear()
	{
		group_idx = (int)NEG_INF;
		rank_in_group = (int)NEG_INF;

		tag1=(int)NEG_INF;
		tag2=(int)NEG_INF;
		tag3=(int)NEG_INF;
		float_tag1=NEG_INF;

		binary_non_zero_idxs.clear();
		binary_novote_idxs.clear();
		real_active_idxs.clear();
		real_active_values.clear();
		real_novote_idxs.clear();
	}

	void add_real_feature(int idx, float val) { 
		real_active_idxs.push_back(idx);
		real_active_values.push_back(val);
	}

	bool get_feature_val(int f_idx, float *val) const;

	void print(ostream& os = cout) const;

	int group_idx;     // for testing purposes
	int rank_in_group; // for testing purposes
	int tag1,tag2,tag3; // extra information that can be stored for debugging
	float float_tag1;
	
	vector<int>	binary_non_zero_idxs; // idxs of all binary variables that have 1
	vector<int> binary_novote_idxs; // idxs for which we should use the default

	vector<int>   real_active_idxs; //
	vector<float> real_active_values; // the real value given to the feature
	vector<int>	  real_novote_idxs; // use the default value with these, should be used
									// only if default value is not 0
};



struct SamplePairWeight {
	SamplePairWeight() : idx1(-1), idx2(-1), tag(-1), weight(0) {};
	SamplePairWeight(int _idx1, int _idx2, weight_t _weight) : idx1(_idx1), idx2(_idx2), weight(_weight) {};
	SamplePairWeight(int _idx1, int _idx2, int _t, weight_t _weight) : idx1(_idx1), idx2(_idx2), tag(_t), 
					weight(_weight) {};

	int   idx1,idx2;
	int	  tag;
	weight_t weight;
};


class RankBoostDataset {

public:

	RankBoostDataset() : num_groups(0), 
						 max_ratio_for_regular_update(10000.0), 
						 total_phi_weight(0) {};

	void     set_max_ratio_for_regular_update(weight_t w) { max_ratio_for_regular_update = w; }
	weight_t get_max_ratio_for_regular_update() const { return max_ratio_for_regular_update; }

	void compute_max_weights_for_normal_update(
		const vector<weight_t>& D0, vector<weight_t>& d) const;

	void compute_total_phi_weight();

	void compute_initial_distribution(vector<weight_t>& d) const;

	void initialize_potenital_lists();

	void initialize_binary_one_lists(int num_binary_features);

	void initialize_binary_ordered_phi_lists(const vector<string>* binary_names = NULL);

	void initialzie_real_feature_table(int num_real_features);

	void initialize_real_vote_lists(const RankBoostModel& rbm);

	void calc_potentials(const vector<weight_t>& D, vector<weight_t>& potentials) const;

	double get_total_phi_weight() const { return total_phi_weight; }

	const vector<RankBoostSample>& get_samples() const { return samples; }
	const RankBoostSample& get_sample(int i) const { return samples[i]; }

	void  add_sample(const RankBoostSample& rbs) { samples.push_back(rbs); }

	void add_to_phi_vector(int x0_idx, int x1_idx, weight_t w = 1.0) 
	{
		phi_support.push_back(SamplePairWeight(x0_idx,x1_idx,w));
	}

	void add_to_phi_vector(int x0_idx, int x1_idx, int tag, weight_t w = 1.0) 
	{
		phi_support.push_back(SamplePairWeight(x0_idx,x1_idx,tag,w));
	}


	int   get_num_samples() const { return samples.size(); }

	const vector<int>& get_binary_one_list(int i) const { return binary_one_lists[i]; }

	const vector< vector<int> >& get_real_vote_lists(int feature_idx) const { return real_vote_lists[feature_idx]; }

	vector<SamplePairWeight>& get_non_const_phi_support() { return phi_support; }

	const vector<SamplePairWeight>& get_phi_support() const { return phi_support; }
	
	const vector<int>& get_binary_pairs_ordered_correctly(int f_idx) const { return binary_pairs_ordered_correctly[f_idx]; }
	
	const vector<int>& get_binary_pairs_ordered_incorrectly(int f_idx) const { return binary_pairs_ordered_incorrectly[f_idx]; }

	double update_dsitribution_according_to_binary_feature(int binary_feature_idx, 
								weight_t alpha, vector<weight_t>& D, bool verbose = false) const;

	double update_distribution_according_to_real_feature( 
		int best_real_idx, float theta_start,  float theta_end, 
		int q_def, weight_t alpha,  vector<weight_t>& D, 
		vector<weight_t> *max_D_for_normal_updates = NULL,
		bool verbose = false) const;

	int  get_num_groups() const { return num_groups; }
	void set_num_groups(int ng) { num_groups = ng; }
	
	void clear();

private:

	int	   num_groups;

	weight_t max_ratio_for_regular_update; // if a sample pair passes X times its intial weight
										   // then the increase in weight is regularized to avoid 
										   // over-training for a small number of samples
	double total_phi_weight;

	vector<RankBoostSample>  samples;

	vector<SamplePairWeight> phi_support; // all pairs of samples with weight > 0 (i.e., idx2 should be ranked higher than idx1)
														   // we perform the regular update when we com to increase the weight
	vector< vector<int> > ahead_lists;    // holds for each sample x, the idxs of samples x' for which x is ahead, that is (x',x) in phi
	vector< vector<int> > behind_lists;   // holds for each sample x, the idxs of samples x' for which x is behind, that is (x,x') in phi
	vector< vector<int> > binary_one_lists; // holds for each feature the idxs of the samples that have a 1 at that position

	vector< vector< vector<int> > > real_vote_lists; // holds the lists of feature idxs that vote on a given feature
	vector<bool>		  ind_all_samples_vote; // holds indicators that state for each real feature if all samples vote on it

	vector< vector<int> > binary_pairs_ordered_correctly;   // indices of phi of pairs (x',x) for which  f(x)=1 and f(x')=0 (or no vote)
	vector< vector<int> > binary_pairs_ordered_incorrectly; // indices of phi of pairs (x',x) for which  f(x)=0 (or no vote) and f(x')=1

	vector< map<int,float> > real_feature_values; // holds a table featuresXsamples of values (NEG_INF == no vote)
};



class RankBoostModel {
public:
	RankBoostModel() : ind_was_initialized(false), 
					   num_binary_features(0), 
					   num_real_features(0),
					   current_round(0),
					   train_error(1),
					   test_error(1),
					   total_default_weight(0) {};

	bool get_ind_was_initialized() const { return ind_was_initialized; }

	void init_rankboost_model_feature_names(
							   const vector<string>& _binary_feature_names,
							   const vector<string>& _real_feature_names);

	void init_rankboost_model_for_training(
							   const RankBoostDataset& training_ds,
							   int	 min_num_active_samples_for_feature = 50,
							   int	 max_num_real_bins_for_real_feature = 50);

	double calc_training_rank_error(const RankBoostDataset& training_ds) const;

	double calc_prediction_error(const RankBoostDataset& rank_ds, 
								 vector<peak_rank_stat>& peak_stats,
								 int peptide_length = -1,
								 int *num_groups = NULL) const;
									
	weight_t calc_rank_score(const RankBoostSample& sample) const;

	weight_t calc_rank_score_with_details(
						const RankBoostSample& sample,
						vector<string>& feature_names,
						vector<float>&	feature_values,
						vector<float>&  feature_scores) const;

	weight_t calc_rank_score_with_details(
						const RankBoostSample& sample,
						vector<int>  &  feature_idxs,
						vector<float>&	feature_values,
						vector<float>&  feature_scores) const;


	weight_t calc_rank_score(const RankBoostSample& sample, 
						  const vector<int>& active_binary_feature_idxs,
						  const vector<int>& active_real_feature_idxs) const;

	bool train_rankboost_model(const RankBoostDataset& training_ds,
							   int   max_num_rounds,
							   vector<idx_weight_pair>* top_misclassified_pairs= NULL,
							   RankBoostDataset* test_set = NULL,
							   int tag3_filter_val = -1,
							   char *report_preifx = NULL,
							   char *stop_signal_file = NULL,
							   const vector<string>* model_header_strings = NULL);

	void print_top_misclassified_pairs(const RankBoostDataset& training_ds,
									   const vector<weight_t>& D,
									   const vector<weight_t>& org_D,
									   int num_top_pairs = 10,
									   ostream& os = cout) const;

	void get_top_misclassified_pairs(
								   const RankBoostDataset& training_ds,
								   const vector<weight_t>& D,
								   const vector<weight_t>& D0,
								   vector<idx_weight_pair>& pair_idxs,
								   int num_top_pairs = -1) const;

	void summarize_features(const vector<RankBoostSample>& samples, ostream& os=cout); 

	void summarize_features_pos_neg(const vector<RankBoostSample>& pos_samples, 
									const vector<RankBoostSample>& neg_samples);

	
	bool read_rankboost_model(istream& is);

	// compresses the weights and limits before writing
	void write_rankboost_model(ostream& os, bool ind_compress = true);

	
	int  get_num_bins_for_real_feature(int real_feature_idx) const { return real_limits[real_feature_idx].size()+1; }

	// returns the index of the value (weight) idx for a given value
	// this is the index j for which our value is less or equal to theta_j
	// since the last theta is assumed to be infinity, the index |limits|
	// is given if our value is larger than all supplied limits
	// normally, the index 0 cannot be given since no value should be less than
	// -infinity which is the value in limits[0]
	int  get_real_bin_idx_for_value(int real_feature_idx, float val) const
	{
		const vector<float>& limits = real_limits[real_feature_idx];
		if (limits.size() == 0)
			return 0;

		int i;
		for (i=1; i<limits.size(); i++)
			if (val<=limits[i])
				break;
		return i;
	}

	int get_num_real_features() const { return num_real_features; }
	int get_num_binary_features() const { return num_binary_features; }
	const vector<weight_t>& get_binary_weights() const { return binary_weights; }
	const vector< vector<weight_t> >& get_real_weights() const { return real_weights; }
	const vector< vector<float> >&    get_real_limits() const  { return real_limits; }
	const vector<weight_t>&           get_real_default_weights() const { return real_default_weights; }
	const vector<string>* get_ptr_to_binary_feature_names() const { return &binary_feature_names; }
	const vector<string>* get_ptr_to_real_feature_names() const { return &real_feature_names; }
	const vector<string>& get_binary_feature_names() const { return binary_feature_names; }
	const vector<string>& get_real_feature_names() const { return real_feature_names; }
	const vector<int>& get_binary_update_counts() const { return binary_update_counts; }
	const vector<int>& get_real_update_counts() const { return real_update_counts; }

	void  set_real_feature_stage_idxs(const vector<int>& stage_idxs) { real_feature_stage_idxs = stage_idxs; }

	void ouput_ranked_feature_list( ostream& os = cout) const;

	void ouput_importance_ranked_feature_list( const RankBoostDataset& training_ds, 
											   ostream& os = cout,
											   int only_fidx = NEG_INF,
											   int round_idx = NEG_INF) ;


	void list_feature_vals_according_to_score(vector<RankBoostSample>& sams) const;


private:

	bool ind_was_initialized;

	int num_binary_features;
	int num_real_features;

	int	   current_round;
	double train_error, test_error;

	weight_t total_default_weight;

	vector<bool>	  ind_active_binary_feature;
	vector<weight_t>  binary_weights;
	vector<string>    binary_feature_names;
	
	vector<bool>	ind_active_real_feature;
	vector< vector<weight_t> > real_weights;  // feature / weights ( |limits|+1 values)
	vector< vector<float> > real_limits;
	vector<weight_t>        real_default_weights; // given to features that don't vote on a sample

	vector<string>			real_feature_names;
	vector<int>				real_feature_stage_idxs; // if set, these values are used to determine
													 // when to enter a feature into the training

	vector<int> non_zero_binary_idxs;
	vector<int> non_zero_real_idxs;

	// used in training
	vector<int> real_first_updates;
	vector<int> real_update_counts;   
	vector<int> binary_update_counts;

	// the values of the best round
	int best_round_idx;
	double best_train_error, best_test_error;
	double best_total_default_weight;
	vector<bool>	  best_ind_active_binary_feature;
	vector<weight_t>  best_binary_weights;
	vector<bool>			   best_ind_active_real_feature;
	vector< vector<weight_t> > best_real_weights;  // feature / weights ( |limits|+1 values)
	vector< vector<float> >	   best_real_limits;
	vector<weight_t>           best_real_default_weights; // given to features that don't vote on a sample
	vector<int> best_real_update_counts; 
	vector<int> best_binary_update_counts;
	vector<int> best_non_zero_binary_idxs;
	vector<int> best_non_zero_real_idxs;


	void select_active_features(const RankBoostDataset& training_ds,
								int min_sample_count = 50);

	void set_real_limits(const vector<RankBoostSample>& samples,
						 int min_bin_size = 50,
						 int max_num_bins = 200,
						 bool clear_weights = true);

	void binary_weak_learn(const vector<weight_t>& potentials,
						   const RankBoostDataset& rank_ds,
						   int&  best_feature_idx,
						   double& r_value,
						   bool only_non_zero_features = false) const;
	
	void real_weak_learn(const vector<weight_t>& potentials,
						 const RankBoostDataset& rank_ds,
						 int&  best_feature_idx,
						 int&  theta_idx,
						 int&  q_def,
						 double& r_value,
						 bool  only_non_zero_features = false) const;

	void real_weak_learn_double_theta(
						 const vector<weight_t>& potentials,
						 const RankBoostDataset& rank_ds,
						 int&  best_feature_idx,
						 int&  theta_start_idx, // x > theta
						 int&  theta_end_idx, // x<= theta
						 int&  q_def,
						 double& r_value,
						 bool  only_non_zero_features = false) const;


	void update_model_weights_for_binary_feature(int best_binary_idx, weight_t alpha);

	void update_model_weights_for_real_feature(weight_t alpha, int best_real_idx, 
								int q_def, int theata_idx_start, int theta_idx_end=-1);

	// compresses the number of limits and weights to represent steps in the weight
	// function without repeatition of the same weights in different limits
	void compress_real_limits_and_weights();

	void remove_default_weights();

	void set_best_model_parameters_to_current_parameters();

	void set_current_model_parameters_to_best_parameters();
};










#endif
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -