// rankboostsample.cpp
}
/***********************************************************************
Creates, for each real feature, the list of samples that vote in each
of the feature's value bins.
************************************************************************/
void RankBoostDataset::initialize_real_vote_lists(const RankBoostModel& rbm)
{
const int num_samples = samples.size();
const int num_real_features = rbm.get_num_real_features();
int i;
real_vote_lists.clear();
real_vote_lists.resize(num_real_features);
ind_all_samples_vote.assign(num_real_features,false); // assign rather than resize so flags left over from a previous call are cleared
vector< vector<int> > counts;
counts.resize(num_real_features);
for (i=0; i<num_real_features; i++)
counts[i].resize(rbm.get_num_bins_for_real_feature(i),0);
for (i=0; i<num_samples; i++)
{
const vector<int>& idxs = samples[i].real_active_idxs;
const vector<float>& vals = samples[i].real_active_values;
int j;
for (j=0; j<idxs.size(); j++)
{
const int bin_idx = rbm.get_real_bin_idx_for_value(idxs[j],vals[j]);
counts[idxs[j]][bin_idx]++;
}
}
// resize vectors according to counts
int f;
for (f=0; f<num_real_features; f++)
{
int total_counts=0;
int i;
for (i=0; i<counts[f].size(); i++)
total_counts+=counts[f][i];
// assumed intent: when every sample supplies a value for this feature, mark it so the
// per-bin vote lists (which would simply enumerate all samples) are not stored below
if (total_counts == num_samples)
ind_all_samples_vote[f] = true;
real_vote_lists[f].resize(counts[f].size());
for (i=0; i<counts[f].size(); i++)
real_vote_lists[f][i].reserve(counts[f][i]);
}
for (i=0; i<num_samples; i++)
{
const vector<int>& idxs = samples[i].real_active_idxs;
const vector<float>& vals = samples[i].real_active_values;
int j;
for (j=0; j<idxs.size(); j++)
{
const int feature_idx = idxs[j];
const int bin_idx = rbm.get_real_bin_idx_for_value(feature_idx,vals[j]);
if (! ind_all_samples_vote[feature_idx])
real_vote_lists[feature_idx][bin_idx].push_back(i);
}
}
}
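/***********************************************************************
A minimal, self-contained sketch of the per-bin "vote list" structure
built above. It is not part of the class: the uniform binning helper
below merely stands in for RankBoostModel::get_real_bin_idx_for_value,
whose actual binning scheme is defined elsewhere, and the example_*
names are hypothetical.
************************************************************************/
#include <vector>

// hypothetical uniform binning of [min_val, max_val) into num_bins bins
static int example_uniform_bin_idx(float val, float min_val, float max_val, int num_bins)
{
    if (val <= min_val)
        return 0;
    if (val >= max_val)
        return num_bins - 1;
    return (int)((val - min_val) / (max_val - min_val) * num_bins);
}

// builds vote_lists[bin] = indices of the samples whose value falls in that bin,
// mirroring the count / reserve / fill pattern used by initialize_real_vote_lists
static void example_build_vote_lists(const std::vector<float>& values,
                                     float min_val, float max_val, int num_bins,
                                     std::vector< std::vector<int> >& vote_lists)
{
    std::vector<int> counts(num_bins, 0);
    size_t i;
    for (i = 0; i < values.size(); i++)
        counts[example_uniform_bin_idx(values[i], min_val, max_val, num_bins)]++;

    vote_lists.clear();
    vote_lists.resize(num_bins);
    for (int b = 0; b < num_bins; b++)
        vote_lists[b].reserve(counts[b]); // avoid reallocations during the fill pass

    for (i = 0; i < values.size(); i++)
        vote_lists[example_uniform_bin_idx(values[i], min_val, max_val, num_bins)].push_back((int)i);
}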
/***********************************************************************
Calculates the potential of each sample x: the total weight of the pairs
in which x is ranked ahead minus the total weight of the pairs in which
x is ranked behind.
************************************************************************/
void RankBoostDataset::calc_potentials(const vector<weight_t>& D, vector<weight_t>& potentials) const
{
const int num_samples = samples.size();
int i;
potentials.resize(num_samples);
for (i=0; i<num_samples; i++)
{
const vector<int>& ahead_list = ahead_lists[i];
const vector<int>& behind_list = behind_lists[i];
weight_t potential = 0;
int j;
for (j=0; j<ahead_list.size(); j++)
potential+=D[ahead_list[j]];
for (j=0; j<behind_list.size(); j++)
potential-=D[behind_list[j]];
potentials[i]=potential;
}
}
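/***********************************************************************
A small self-contained illustration of the potential computed above.
The explicit pair representation used here is hypothetical; the class
stores the same information as per-sample ahead_lists / behind_lists
indexing into the shared weight vector D.
************************************************************************/
#include <vector>

struct ExamplePair { int ahead; int behind; }; // "ahead" should be ranked above "behind"

static void example_calc_potentials(const std::vector<ExamplePair>& pairs,
                                    const std::vector<double>& D, // one weight per pair
                                    int num_samples,
                                    std::vector<double>& potentials)
{
    potentials.assign(num_samples, 0.0);
    for (size_t p = 0; p < pairs.size(); p++)
    {
        potentials[pairs[p].ahead]  += D[p]; // x gains the weight of every pair in which it is ahead
        potentials[pairs[p].behind] -= D[p]; // and loses the weight of every pair in which it is behind
    }
}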
/***************************************************************************************
Updates the weights of the pairs affected by the current boosting round.
Weights of correctly ordered pairs go down, and weights of incorrectly
ordered pairs go up. The function returns the normalizing constant Z.
To reduce numerical error, the summations are accumulated in batches.
****************************************************************************************/
double RankBoostDataset::update_dsitribution_according_to_binary_feature(
int binary_feature_idx,
weight_t alpha,
vector<weight_t>& D,
bool verbose) const
{
const vector<int>& correct_pairs = binary_pairs_ordered_correctly[binary_feature_idx];
const vector<int>& incorrect_pairs = binary_pairs_ordered_incorrectly[binary_feature_idx];
const weight_t norm_correct = exp(-alpha);
const weight_t norm_incorrect = exp(alpha);
const int add_batch_size = 1000;
double mod_weight_before =0;
double mod_weight_after =0;
double batch_before =0;
double batch_after =0;
int batch_count =0;
int i;
for (i=0; i<correct_pairs.size(); i++)
{
weight_t& pair_weight = D[correct_pairs[i]];
batch_before += pair_weight;
pair_weight *= norm_correct;
batch_after += pair_weight;
batch_count++;
if (batch_count == add_batch_size)
{
mod_weight_before += batch_before;
mod_weight_after += batch_after;
batch_count=0;
batch_before=0;
batch_after=0;
}
}
mod_weight_before += batch_before;
mod_weight_after += batch_after;
batch_count=0;
batch_before=0;
batch_after=0;
for (i=0; i<incorrect_pairs.size(); i++)
{
weight_t& pair_weight = D[incorrect_pairs[i]];
batch_before += pair_weight;
pair_weight *= norm_incorrect;
batch_after += pair_weight;
batch_count++;
if (batch_count == add_batch_size)
{
mod_weight_before += batch_before;
mod_weight_after += batch_after;
batch_count=0;
batch_before=0;
batch_after=0;
}
}
mod_weight_before += batch_before;
mod_weight_after += batch_after;
// Z assumes D summed to 1 before this round: the unmodified mass plus the modified mass after the update
double total_weight_after = (1.0 - mod_weight_before) + mod_weight_after;
return total_weight_after;
}
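/***************************************************************************************
A sketch of how the Z returned above is typically consumed by the surrounding boosting
loop. Neither helper exists in this file: the choice of alpha follows the standard
RankBoost/AdaBoost rule alpha = 0.5 * ln(W_correct / W_incorrect), and dividing D by Z
is assumed to be done by the caller so that the weights sum to 1 again.
****************************************************************************************/
#include <vector>
#include <cmath>

// hypothetical helper: pick alpha from the total weights of correctly / incorrectly ordered pairs
static double example_choose_alpha(double w_correct, double w_incorrect, double smooth = 1e-8)
{
    return 0.5 * std::log((w_correct + smooth) / (w_incorrect + smooth));
}

// hypothetical helper: divide every pair weight by Z so the distribution is normalized again
static void example_renormalize(std::vector<double>& D, double Z)
{
    for (size_t i = 0; i < D.size(); i++)
        D[i] /= Z;
}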
/************************************************************************************
Updates the weights of the pairs affected by the current boosting round for a
real-valued feature. Examines all pairs in the support of phi, including those in
which a sample does not vote (such samples fall back to q_def).
To reduce numerical error, the summations are accumulated in batches.
Pass POS_INF as theta_end if there is no upper threshold.
*************************************************************************************/
double RankBoostDataset::update_distribution_according_to_real_feature(
int best_real_idx,
float theta_start,
float theta_end,
int q_def,
weight_t alpha,
vector<weight_t>& D,
vector<weight_t> *max_D_for_normal_updates,
bool verbose) const
{
const map<int,float>& feature_values = real_feature_values[best_real_idx];
const int add_batch_size = 1000;
const weight_t norm_correct = exp(-alpha);
const weight_t norm_incorrect = exp(alpha);
double mod_weight_before = 0; // total weight of the modified pairs before the update
double mod_weight_after = 0; // total weight of the modified pairs after the update
double batch_before =0;
double batch_after =0;
int num_diff = 0;
int batch_count =0;
int i;
for (i=0; i<phi_support.size(); i++)
{
const int x0 = phi_support[i].idx1;
const int x1 = phi_support[i].idx2;
float x0_val = NEG_INF, x1_val = NEG_INF;
bool x0_vote =true, x1_vote=true;
map<int,float>::const_iterator it0 = feature_values.find(x0);
map<int,float>::const_iterator it1 = feature_values.find(x1);
if ( it0 != feature_values.end())
{
x0_val = it0->second;
}
else
x0_vote = false;
if ( it1 != feature_values.end())
{
x1_val = it1->second;
}
else
x1_vote = false;
int h0 = q_def, h1 = q_def;
if (x0_vote)
h0 = (x0_val>theta_start ? 1 : 0);
if (x1_vote)
h1 = (x1_val>theta_start ? 1 : 0);
// check whether the values exceed the upper threshold (relevant only when a finite theta_end is used)
if (x0_val > theta_end)
h0 = 0;
if (x1_val > theta_end)
h1 = 0;
if (h0 == h1)
continue;
num_diff++;
weight_t& pair_weight = D[i];
batch_before += pair_weight;
if (! max_D_for_normal_updates || pair_weight < max_D_for_normal_updates->at(i))
{
pair_weight *= (h1>0 ? norm_correct : norm_incorrect);
// cout << pair_weight << " <-> " << max_D_for_normal_updates->at(i) << endl;
}
else
{
double update_weight = (h1>0 ? norm_correct : norm_incorrect);
if (update_weight<=1.0)
{
pair_weight *= update_weight;
}
else // the weight already exceeds the cap, so damp the increase in proportion to max_D/weight
{
double delta = update_weight - 1.0;
delta *= (max_D_for_normal_updates->at(i)/pair_weight);
pair_weight *= (1.0+delta);
}
}
batch_after += pair_weight;
batch_count++;
if (batch_count == add_batch_size)
{
mod_weight_before += batch_before;
mod_weight_after += batch_after;
batch_count=0;
batch_before=0;
batch_after=0;
}
}
mod_weight_before += batch_before;
mod_weight_after += batch_after;
double total_weight_after = (1.0 - mod_weight_before) + mod_weight_after;
if (verbose)
{
cout << setprecision(7);
cout << "Theta: " << theta_start;
if (theta_end<POS_INF)
cout << "-" << theta_end;
cout << " #diff " << num_diff << endl;
cout << "mod weight " << mod_weight_before << " -> " << mod_weight_after << " Z=" << total_weight_after << endl;
}
return total_weight_after;
}
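/************************************************************************************
The decision rule applied to each member of a pair in the loop above, restated as a
tiny standalone function for clarity. It adds nothing new: a sample votes 1 when its
value lies in (theta_start, theta_end], votes 0 above theta_end (theta_end == POS_INF
disables that bound), and a sample with no value for the feature falls back to q_def.
*************************************************************************************/
static int example_range_hypothesis(bool has_value, float val,
                                    float theta_start, float theta_end, int q_def)
{
    if (!has_value)
        return q_def; // the sample does not vote on this feature
    if (val > theta_end)
        return 0; // above the upper threshold
    return (val > theta_start) ? 1 : 0;
}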