// rankboost.cpp
// The end bin size and mid bin size are the sizes that can be given to bins
// of real values. The end bins are made larger to reduce the incidence of
// overfitting to the extreme values.
int bin_size = (min_num_samples_for_feature/2)+1;
if (num_real_vals/max_num_bins>bin_size)
bin_size = num_real_vals/max_num_bins;
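// Illustrative numeric sketch (hypothetical values, not from the original
// code): with min_num_samples_for_feature = 20, num_real_vals = 1000 and
// max_num_bins = 50, the first rule gives bin_size = 20/2+1 = 11, but
// num_real_vals/max_num_bins = 1000/50 = 20 > 11, so bin_size becomes 20,
// i.e., each interior bin must cover at least 20 sorted feature values.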
real_limits[i].push_back(NEG_INF);
// go according to unique values
if (j==num_real_vals)
{
int k;
for (k=0; k<unique_vals.size(); k++)
{
if (k<unique_vals.size()-1 &&
counts[k]<bin_size)
{
counts[k+1]+=counts[k];
continue;
}
if (counts[k]>=bin_size)
real_limits[i].push_back(unique_vals[k]);
}
if (real_limits[i].size()>1)
real_limits[i].pop_back(); // drop the last limit so the top bin also covers all larger values
}
else
{
if (num_real_vals > min_num_samples_for_feature)
{
const int max_idx = num_real_vals - bin_size;
int idx = bin_size;
float last_limit = NEG_INF;
while (idx<max_idx)
{
float new_limit = real_vals[i][idx];
if (new_limit != last_limit)
{
real_limits[i].push_back(new_limit);
last_limit = new_limit;
}
idx+=bin_size;
}
if (real_limits[i].size()>1)
real_limits[i].pop_back(); // drop the last limit so the top bin also covers all larger values
}
}
}
if (clear_weights)
{
for (i=0; i<num_real_features; i++)
{
real_weights[i].clear();
real_weights[i].resize(real_limits[i].size()+1,0);
}
}
cout << "Set real limits for " << num_real_features << " real features..." << endl;
}
/************************************************************************
Chooses which features should be considered active (i.e., have their weights
updated in the boosting rounds). A binary feature stays active only if both
of its values (0 and 1) are observed in at least min_sample_count samples.
*************************************************************************/
void RankBoostModel::select_active_features(const RankBoostDataset& training_ds,
int min_sample_count)
{
ind_active_binary_feature.resize(num_binary_features,true);
ind_active_real_feature.resize(num_real_features,true);
const vector<RankBoostSample>& samples = training_ds.get_samples();
const int num_samples = samples.size();
vector<int> binary_one_counts, binary_zero_counts, binary_inactive_counts;
if (real_limits.size() != real_feature_names.size())
{
cout << "Error: must first set real feature limits!" << endl;
exit(1);
}
binary_one_counts.resize(num_binary_features,0);
binary_zero_counts.resize(num_binary_features,0);
binary_inactive_counts.resize(num_binary_features,0);
int i;
for (i=0; i<num_samples; i++)
{
const RankBoostSample& sam = samples[i];
vector<bool> used_inds;
used_inds.resize(num_binary_features,false);
int j;
for (j=0; j< sam.binary_novote_idxs.size(); j++)
{
binary_inactive_counts[sam.binary_novote_idxs[j]]++;
used_inds[sam.binary_novote_idxs[j]]=true;
}
for (j=0; j<sam.binary_non_zero_idxs.size(); j++)
{
binary_one_counts[sam.binary_non_zero_idxs[j]]++;
used_inds[sam.binary_non_zero_idxs[j]]=true;
}
for (j=0; j<num_binary_features; j++)
if (! used_inds[j])
binary_zero_counts[j]++;
}
// inactivate binary features
for (i=0; i<num_binary_features; i++)
{
if (binary_one_counts[i]<min_sample_count ||
binary_zero_counts[i]<min_sample_count)
{
ind_active_binary_feature[i]=false;
cout << "Inactivating BINARY feature " << i << " " << this->binary_feature_names[i];
cout << " (counts 0: " << binary_zero_counts[i] << " 1: " <<
binary_one_counts[i] << " nv: " << binary_inactive_counts[i] << ")" << endl;
}
}
// inactivate real features with no ranges
for (i=0; i<num_real_features; i++)
{
if (real_limits[i].size()==0)
{
ind_active_real_feature[i]=false;
cout << "Inactivating REAL feature " << i << " " << real_feature_names[i] << endl;
}
}
}
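// Illustrative sketch (not part of the original model): the activity rule
// used above for binary features, pulled out as a standalone helper so it can
// be tested in isolation. The function name is hypothetical; it only assumes
// the per-feature 0/1 counts that select_active_features() already gathers,
// plus the <vector> / using declarations already present in this file.
static vector<bool> active_binary_flags_sketch(const vector<int>& one_counts,
						const vector<int>& zero_counts,
						int min_sample_count)
{
	vector<bool> active(one_counts.size(), true);
	for (size_t i = 0; i < one_counts.size(); i++)
		// a feature stays active only if both values were observed
		// at least min_sample_count times, mirroring the test above
		if (one_counts[i] < min_sample_count || zero_counts[i] < min_sample_count)
			active[i] = false;
	return active;
}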
/************************************************************************
Performs the weak learn function on the binary features.
Assumes that q_def is always 0 (so its selection is ignored).
The function assumes that the potentials have already been computed.
The function examines all features and returns the one for which |r| is
the largest.
*************************************************************************/
void RankBoostModel::binary_weak_learn(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
double& best_r_value,
bool only_non_zero_features) const
{
best_feature_idx=-1;
best_r_value = 0;
if (only_non_zero_features && non_zero_binary_idxs.size() == 0)
return;
int nz_idx=0;
int i;
for (i=0; i<num_binary_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_binary_idxs.size())
return;
// jump directly to the next feature that has a non-zero weight
// (mirroring the real-valued learner below); advancing nz_idx
// unconditionally keeps the scan aligned with the sorted index list
i=non_zero_binary_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_binary_feature[i])
continue;
const vector<int>& active_idxs = rank_ds.get_binary_one_list(i);
double r=0;
int j;
for (j=0; j<active_idxs.size(); j++)
r+=potentials[active_idxs[j]];
if (fabs(r) > fabs(best_r_value))
{
best_feature_idx=i;
best_r_value = r;
}
}
}
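// Illustrative sketch (not part of the original model): the r-value that
// binary_weak_learn() maximizes is just the sum of the pair potentials over
// the samples on which the feature fires. A standalone version of that inner
// loop on plain doubles (the function name is hypothetical):
static double binary_r_value_sketch(const vector<double>& potentials,
						const vector<int>& one_idxs)
{
	double r = 0.0;
	for (size_t j = 0; j < one_idxs.size(); j++)
		r += potentials[one_idxs[j]];	// only samples with feature value 1 contribute
	return r;				// the learner keeps the feature with the largest |r|
}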
/************************************************************************
Performs the weak learn function on the real-valued features.
The function assumes that the potentials have already been computed.
It finds the feature and theta threshold for which |r| is maximized.
It also returns the q_def (0 or 1) which should be given to samples
that did not vote on this feature (if q_def is 1, then the default weights
for this feature should be updated accordingly).
*************************************************************************/
void RankBoostModel::real_weak_learn(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
int& theta_idx,
int& q_def,
double& best_r_value,
bool only_non_zero_features) const
{
const int num_samples = rank_ds.get_num_samples();
best_feature_idx=0;
theta_idx=-1;
q_def=-1;
best_r_value=0;
int nz_idx =0;
int i;
for (i=0; i<num_real_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_real_idxs.size())
return;
// jump directly to the next feature that has a non-zero weight
i=non_zero_real_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_real_feature[i])
continue;
const int num_thetas = real_limits[i].size()+1;
double R=0;
vector<double> L_bin_values;
L_bin_values.resize(num_thetas,0);
const vector< vector<int> >& lists = rank_ds.get_real_vote_lists(i);
int j;
for (j=0; j<lists.size(); j++)
{
const vector<int>& bin_list = lists[j];
int k;
for (k=0; k<bin_list.size(); k++)
L_bin_values[j]+=potentials[bin_list[k]];
R+=L_bin_values[j];
}
// The values in bin j are all greater than the limit at index j-1,
// so we return theta index j and update all weights in positions j and up.
// The first bin (j=0) would hold values below -infinity, so it is always
// empty and is never considered.
double L=0;
for (j=L_bin_values.size()-1; j>0; j--)
{
int q=0; // default for novote
L+=L_bin_values[j];
if (fabs(L)<=fabs(L-R))
q=1;
if (fabs(L-(double)q*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L-(double)q*R;
theta_idx = j;
q_def = q;
/* if (fabs(best_r_value)>1)
{
cout << "Feature: " << i << "," << j << " best_r_value: " << best_r_value << endl;
int k;
for (k=0; k<real_limits[i].size(); k++)
cout << real_limits[i][k] << "\t";
cout << endl;
for (k=0; k<L_bin_values.size(); k++)
cout << L_bin_values[k] << "\t";
cout << endl;
cout << endl << endl;
}*/
}
}
}
}
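// Illustrative sketch (not part of the original model): for one real feature,
// real_weak_learn() scans candidate thresholds from the highest bin downward.
// L is the running sum of bin potentials above the candidate theta, R is the
// total over all bins, and the score is r = L - q*R with q chosen as 0 or 1,
// whichever makes |r| larger. A standalone version of that scan on plain
// doubles (the function name is hypothetical):
static double best_threshold_score_sketch(const vector<double>& bin_potentials,
						int& best_theta_idx, int& best_q)
{
	double R = 0.0;
	size_t j;
	for (j = 0; j < bin_potentials.size(); j++)
		R += bin_potentials[j];			// total potential over all bins
	double best_r = 0.0, L = 0.0;
	best_theta_idx = -1;
	best_q = -1;
	int b;
	for (b = (int)bin_potentials.size() - 1; b > 0; b--)	// bin 0 ("below -infinity") is always empty
	{
		L += bin_potentials[b];
		const int q = (fabs(L) <= fabs(L - R)) ? 1 : 0;	// q_def for samples that did not vote
		const double r = L - (double)q * R;
		if (fabs(r) > fabs(best_r))
		{
			best_r = r;
			best_theta_idx = b;
			best_q = q;
		}
	}
	return best_r;
}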
/************************************************************************
Performs the weak learn function on the real-valued features, using a pair
of thresholds (a start bin and an end bin) rather than a single theta.
The function assumes that the potentials have already been computed.
It finds the feature and the bin range [theta_start, theta_end] for which
|r| is maximized. It also returns the q_def (0 or 1) which should be given
to samples that did not vote on this feature (if q_def is 1, then the
default weights for this feature should be updated accordingly).
*************************************************************************/
void RankBoostModel::real_weak_learn_double_theta(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
int& theta_start_idx,
int& theta_end_idx,
int& q_def,
double& best_r_value,
bool only_non_zero_features) const
{
const int num_samples = rank_ds.get_num_samples();
best_feature_idx=0;
theta_start_idx=-1;
theta_end_idx =-1;
q_def=-1;
best_r_value=0;
int nz_idx =0;
int i;
for (i=0; i<num_real_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_real_idxs.size())
return;
i=non_zero_real_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_real_feature[i])
continue;
const int num_thetas = real_limits[i].size()+1;
const int last_bin_idx = num_thetas - 1;
double R=0;
vector<double> L_bin_values;
L_bin_values.resize(num_thetas,0);
const vector< vector<int> >& lists = rank_ds.get_real_vote_lists(i);
int j;
for (j=0; j<lists.size(); j++)
{
const vector<int>& bin_list = lists[j];
int k;
for (k=0; k<bin_list.size(); k++)
L_bin_values[j]+=potentials[bin_list[k]];
R+=L_bin_values[j];
}
// The values in bin j are all greater than the limit at index j-1,
// so we return theta index j and update all weights in positions j and up.
// The first bin (j=0) would hold values below -infinity, so it is always
// empty and is never considered.
double L=0;
for (j=last_bin_idx; j>0; j--)
{
int q=0; // default for novote
L+=L_bin_values[j];
if (fabs(L)<=fabs(L-R))
q=1;
if (fabs(L-(double)q*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L-(double)q*R;
theta_start_idx = j;
theta_end_idx = last_bin_idx;
q_def = q;
}
double L_prime = L;
int k;
for (k=last_bin_idx; k>j; k--)
{
L_prime -= L_bin_values[k];
// choose q for this candidate band from scratch rather than carrying
// over a value set for a previous candidate
int q_prime = (fabs(L_prime)<=fabs(L_prime-R)) ? 1 : 0;
if (fabs(L_prime-(double)q_prime*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L_prime-(double)q_prime*R;
theta_start_idx = j;
theta_end_idx = k-1;
q_def = q_prime;
}
}
}
}
}
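// Illustrative sketch (not part of the original model): the double-theta
// learner above scores a closed band of bins [start, end] instead of a single
// open-ended threshold. L' is the sum of the bin potentials inside the band
// and the score is again r = L' - q*R. A standalone scorer for one candidate
// band on plain doubles (the function name is hypothetical):
static double band_score_sketch(const vector<double>& bin_potentials,
						int start_bin, int end_bin, int& q_out)
{
	double R = 0.0, L_prime = 0.0;
	size_t j;
	for (j = 0; j < bin_potentials.size(); j++)
		R += bin_potentials[j];			// total potential over all bins
	int b;
	for (b = start_bin; b <= end_bin; b++)
		L_prime += bin_potentials[b];		// potential of samples falling inside the band
	q_out = (fabs(L_prime) <= fabs(L_prime - R)) ? 1 : 0;	// default for samples that did not vote
	return L_prime - (double)q_out * R;		// the learner keeps the band with the largest |r|
}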
/************************************************************************
Performs the training of the rank boost model.
This function assumes that the training_ds was already initialized (has
phi_support computed already), and that the model has been initialized
for this dataset (feature names set, active features selected).
The algorithm performs a designated number of rounds (where each round is
considered an update of all active features).
If a test set is given, the test error is reported for each round.
*************************************************************************/
bool RankBoostModel::train_rankboost_model(
const RankBoostDataset& training_ds,
int max_num_rounds,
vector<idx_weight_pair>* top_misclassified_pairs,
RankBoostDataset* test_ds,
int test_tag3_filter_val,
char *report_prefix,
char *stop_signal_file,
const vector<string>* model_header_strings) // these appear at the top of the boost model file
// in case there is extra info not in the ordinary
// boost models.
{
fstream out_file_stream, stat_file_stream, train_res, test_res, flist_stream;
bool use_cout = true;
int running_feature_report_idx = -1; // if set to a feature idx, will give a report on the feature's progress
// open streams for reporting
if (report_prefix)
{
char buff[512];
sprintf(buff,"%s_progress.txt",report_prefix);
out_file_stream.open(buff,ios::out);
if (! out_file_stream.is_open() || ! out_file_stream.good())