📄 rankboost.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:

	for (i=0; i<sample.real_active_idxs.size(); i++)
	{
		const int feature_idx = sample.real_active_idxs[i];
		
		if (real_weights[feature_idx].size()==0) // inactive feature
			continue;

		const int bin_idx = get_real_bin_idx_for_value(feature_idx,sample.real_active_values[i]);
		score += real_weights[feature_idx][bin_idx];
		score -= real_default_weights[feature_idx]; // remove novote scores that shouldn't have been added
	}

	return score;
}


/*************************************************************************
Computes the rank score of a sample and reports how the score breaks down
by feature (features identified by name).

The returned score is the sum of:
  - the binary weights of the sample's non-zero binary features,
  - total_default_weight (the no-vote weights of all features),
  - for every active real feature that has weights: the weight of the bin
    returned by get_real_bin_idx_for_value() minus that feature's default
    weight.

feature_names, feature_values and feature_scores are parallel output
vectors and are all cleared on entry. Entry 0 is named "defaults", has
value 0 and holds the binary contribution plus total_default_weight.
Each following entry describes one real feature whose net contribution
(bin weight - default weight) is non-zero.

Prints an error and calls exit(1) if the model was not initialized.
**************************************************************************/
weight_t RankBoostModel::calc_rank_score_with_details(
						const RankBoostSample& sample,
						vector<string>& feature_names,
						vector<float>&	feature_values,
						vector<float>&  feature_scores) const
{
	const vector<int>& binary_idxs = sample.binary_non_zero_idxs;

	// All three outputs must be reset together; otherwise a reused
	// feature_values vector keeps stale entries and the vectors no longer
	// line up index by index.
	feature_names.clear();
	feature_values.clear();
	feature_scores.clear();
	weight_t score=0;

	if (! ind_was_initialized)
	{
		cout << "Error: RankBoostModel not initialized!" << endl;
		exit(1);
	}

	size_t i;
	for (i=0; i<binary_idxs.size(); i++)
		score += binary_weights[binary_idxs[i]];

	// first add all no vote scores
	score += total_default_weight;

	// entry 0: everything accumulated so far (binary weights + defaults)
	feature_names.push_back("defaults");
	feature_values.push_back(0);
	feature_scores.push_back(score);

	for (i=0; i<sample.real_active_idxs.size(); i++)
	{
		const int feature_idx = sample.real_active_idxs[i];
		
		if (real_weights[feature_idx].size()==0) // inactive feature
			continue;

		const int bin_idx = get_real_bin_idx_for_value(feature_idx,sample.real_active_values[i]);
		score += real_weights[feature_idx][bin_idx];
		score -= real_default_weights[feature_idx]; // remove novote scores that shouldn't have been added

		// what this feature changed relative to its default (no-vote) weight
		score_t net_score = real_weights[feature_idx][bin_idx] - real_default_weights[feature_idx];

		if (net_score != 0)
		{
			feature_names.push_back(real_feature_names[feature_idx]);
			feature_values.push_back(sample.real_active_values[i]);
			feature_scores.push_back(net_score);
		}
	}

	return score;
}


/*************************************************************************
Same scoring as the name-based overload, but the contributing real
features are reported by index instead of by name.

The score is accumulated in this order: binary weights of the sample's
non-zero binary features, then total_default_weight, then for each active
real feature with weights the bin weight minus the feature's default
weight.

feature_idxs, feature_values and feature_scores are cleared on entry and
receive one entry per real feature whose net contribution
(bin weight - default weight) is non-zero. No "defaults" entry is added.

Prints an error and calls exit(1) if the model was not initialized.
**************************************************************************/
weight_t RankBoostModel::calc_rank_score_with_details(
						const RankBoostSample& sample,
						vector<int>  &	feature_idxs,
						vector<float>&	feature_values,
						vector<float>&  feature_scores) const
{
	feature_idxs.clear();
	feature_values.clear();
	feature_scores.clear();

	if (! ind_was_initialized)
	{
		cout << "Error: RankBoostModel not initialized!" << endl;
		exit(1);
	}

	// binary part, followed by the no-vote weights of all features
	const vector<int>& active_binary = sample.binary_non_zero_idxs;
	weight_t total = 0;
	for (size_t b=0; b<active_binary.size(); ++b)
		total += binary_weights[active_binary[b]];

	total += total_default_weight;

	// real part: swap each voting feature's default weight for its bin weight
	const size_t num_active_real = sample.real_active_idxs.size();
	for (size_t r=0; r<num_active_real; ++r)
	{
		const int f_idx = sample.real_active_idxs[r];

		// a feature without weights is inactive and contributes nothing
		if (real_weights[f_idx].empty())
			continue;

		const int bin = get_real_bin_idx_for_value(f_idx, sample.real_active_values[r]);
		total += real_weights[f_idx][bin];
		total -= real_default_weights[f_idx];

		const score_t net = real_weights[f_idx][bin] - real_default_weights[f_idx];
		if (net == 0)
			continue;

		feature_idxs.push_back(f_idx);
		feature_values.push_back(sample.real_active_values[r]);
		feature_scores.push_back(net);
	}

	return total;
}



/*************************************************************************
Sets the model's binary and real feature names and sizes the per-feature
containers to match.

Binary weights are reset to 0 for every binary feature. The real limits
and both lists of non-zero feature indexes are emptied.

NOTE(review): real_default_weights and real_weights are only resized, not
cleared first, so entries that already exist keep their current contents.
Confirm this is intended when a model is re-initialized.
**************************************************************************/
void RankBoostModel::init_rankboost_model_feature_names(	
							   const vector<string>& _binary_feature_names,
							   const vector<string>& _real_feature_names)
{
	// --- binary features ---
	binary_feature_names = _binary_feature_names;
	num_binary_features  = binary_feature_names.size();

	binary_weights.clear();
	binary_weights.resize(num_binary_features,0);
	non_zero_binary_idxs.clear();

	// --- real valued features ---
	real_feature_names = _real_feature_names;
	num_real_features  = real_feature_names.size();

	real_default_weights.resize(num_real_features,0);
	real_weights.resize(num_real_features);
	real_limits.clear();
	non_zero_real_idxs.clear();
}


/***********************************************************************
Initializes the list of active features and limits for the real valued
features according to the data provided by the training_set.
Each active binary feature must have the given minimum number of active
samples per feature. Each bin in the real valued feature must also have
at least the same number of samples in it.
************************************************************************/
void RankBoostModel::init_rankboost_model_for_training(
						const RankBoostDataset& training_ds,
						int	 min_num_active_samples_for_feature,
						int	 max_num_real_bins_for_real_feature)
{

//	cout << "set limits..." << endl;
	set_real_limits(training_ds.get_samples(), 
					min_num_active_samples_for_feature, 
					max_num_real_bins_for_real_feature);

//	cout << "select active..." << endl;
	select_active_features(training_ds, min_num_active_samples_for_feature);
}




/*************************************************************************
Writes a per-feature usage report for the given samples to os.

For every feature the function counts in how many samples the feature is
listed as "no vote" (inactive) and in how many it is non-zero/active.

Binary report (only printed if there are binary features): one line per
feature that is not inactive in all samples, with tab separated columns
   index, 1 - inactive/num_samples, non-zero count,
   non-zero count / (num_samples - inactive), feature name

Real report: one block per feature that is inactive in fewer than
num_samples-10 samples, with the same columns as above. If more than 20
values were collected for the feature, a second line lists 10 values
taken at every (count/10)-th position of the sorted values.

All output, including the closing blank line, goes to os.
**************************************************************************/
void RankBoostModel::summarize_features(const vector<RankBoostSample>& samples,
										ostream& os)
{
	const int num_samples = samples.size();

	vector<int> inactive_binary_count(num_binary_features,0);
	vector<int> non_zero_binary_count(num_binary_features,0);
	vector<int> inactive_real_count(num_real_features,0);
	vector<int> non_zero_real_count(num_real_features,0);
	vector< vector<float> > real_vals(num_real_features);

	int i;
	for (i=0; i<num_samples; i++)
	{
		const RankBoostSample& sam = samples[i];

		size_t j;
		for (j=0; j<sam.binary_novote_idxs.size(); j++)
		{
			// NOTE(review): the whole sample is printed once per no-vote
			// index; this looks like leftover debug output - confirm
			// before removing.
			sam.print(os);
			inactive_binary_count[sam.binary_novote_idxs[j]]++;
		}

		for (j=0; j<sam.binary_non_zero_idxs.size(); j++)
			non_zero_binary_count[sam.binary_non_zero_idxs[j]]++;

		for (j=0; j<sam.real_novote_idxs.size(); j++)
		{
			inactive_real_count[sam.real_novote_idxs[j]]++;
			sam.print(os); // NOTE(review): same per-index print as above
		}

		for (j=0; j<sam.real_active_idxs.size(); j++)
		{
			non_zero_real_count[sam.real_active_idxs[j]]++;
			real_vals[sam.real_active_idxs[j]].push_back(sam.real_active_values[j]);
		}
	}

	if (binary_feature_names.size()>0)
	{
		os << "REPORT ON BINARY FEATURES:" << endl;
		os << "--------------------------" << endl;
		for (i=0; i<binary_feature_names.size(); i++)
		{
			// skip features that never vote; this also keeps the
			// denominator below strictly positive
			if (inactive_binary_count[i]<num_samples)
			{
				os << i << "\t" << 1.0 - inactive_binary_count[i]/(double)num_samples << "\t";
				os << non_zero_binary_count[i] << "\t" << non_zero_binary_count[i]/(double)(num_samples-inactive_binary_count[i]) << "\t";
				os << binary_feature_names[i] << endl;
			}
		}
		os << endl;
	}
	os << "REPORT ON REAL FEATURES:" << endl;
	os << "------------------------" << endl;
	for (i=0; i<real_feature_names.size(); i++)
	{
		// only report features that vote in more than 10 samples
		if (inactive_real_count[i]<num_samples-10)
		{
			os << i << "\t" << 1.0 - inactive_real_count[i]/(double)num_samples << "\t";
			os << non_zero_real_count[i] << "\t" << non_zero_real_count[i]/(double)(num_samples-inactive_real_count[i]) << "\t";
			os << real_feature_names[i] << endl;
			if (real_vals[i].size()>20)
			{
				// 10 evenly spaced values from the sorted list
				sort(real_vals[i].begin(),real_vals[i].end());
				int s_inc = real_vals[i].size()/10;
				int j;
				for (j=0; j<10; j++)
					os << real_vals[i][j*s_inc] << " ";
				os << endl;
			}
			os << endl;
		}
	}
	// the closing blank line belongs to the report, so it goes to os
	// (it used to be written to cout)
	os << endl;
}


/******************************************************************************
Prints to cout a report that compares the real valued features of a set of
positive samples with a set of negative samples.

For each sample set the function counts, per real feature, in how many
samples the feature is listed as "no vote" (inactive) and in how many it is
active, and collects the active values.

A feature is skipped if it is inactive in at least (set size - 10) samples
of both sets. Otherwise:
  - if the positives have enough voting samples, a line is printed with
       index, pos active count, pos active / pos voting samples,
       neg active count, neg active / neg voting samples, feature name
    followed by a "POS:" line with 10 values taken at every (count/10)-th
    position of the sorted positive values (only if more than 20 values);
    otherwise "<index> not enough pos" is printed.
  - if the negatives have enough voting samples, a "NEG:" line is printed
    in the same way; otherwise "Not enough neg" is printed.

A ratio whose number of voting samples is not positive is printed as 0.
*******************************************************************************/
void RankBoostModel::summarize_features_pos_neg(
									const vector<RankBoostSample>& pos_samples, 
									const vector<RankBoostSample>& neg_samples)
{
	const int num_pos_samples = pos_samples.size();
	const int num_neg_samples = neg_samples.size();

	vector<int> pos_inactive_real_count(num_real_features,0);
	vector<int> pos_non_zero_real_count(num_real_features,0);
	vector<int> neg_inactive_real_count(num_real_features,0);
	vector<int> neg_non_zero_real_count(num_real_features,0);
	vector< vector<float> > pos_real_vals(num_real_features);
	vector< vector<float> > neg_real_vals(num_real_features);

	int i;
	size_t j;
	for (i=0; i<num_pos_samples; i++)
	{
		const RankBoostSample& pos_sam = pos_samples[i];

		// The inactive counts were never filled in before, which made every
		// "not enough samples" test below depend only on the set size.
		for (j=0; j<pos_sam.real_novote_idxs.size(); j++)
			pos_inactive_real_count[pos_sam.real_novote_idxs[j]]++;

		for (j=0; j<pos_sam.real_active_idxs.size(); j++)
		{
			pos_non_zero_real_count[pos_sam.real_active_idxs[j]]++;
			pos_real_vals[pos_sam.real_active_idxs[j]].push_back(pos_sam.real_active_values[j]);
		}
	}

	for (i=0; i<num_neg_samples; i++)
	{
		const RankBoostSample& neg_sam = neg_samples[i];

		for (j=0; j<neg_sam.real_novote_idxs.size(); j++)
			neg_inactive_real_count[neg_sam.real_novote_idxs[j]]++;

		for (j=0; j<neg_sam.real_active_idxs.size(); j++)
		{
			neg_non_zero_real_count[neg_sam.real_active_idxs[j]]++;
			neg_real_vals[neg_sam.real_active_idxs[j]].push_back(neg_sam.real_active_values[j]);
		}
	}


	cout << "REPORT ON REAL FEATURES:" << endl;
	cout << "------------------------" << endl;
	for (i=0; i<real_feature_names.size(); i++)
	{
		const bool enough_pos = (pos_inactive_real_count[i]<num_pos_samples-10);
		const bool enough_neg = (neg_inactive_real_count[i]<num_neg_samples-10);

		// nothing worth reporting if neither set has enough voting samples
		if (! enough_pos && ! enough_neg)
			continue;

		if (enough_pos)
		{
			// enough_pos guarantees more than 10 voting positives; the
			// negatives may have none, so their ratio is guarded
			const int num_pos_voting = num_pos_samples-pos_inactive_real_count[i];
			const int num_neg_voting = num_neg_samples-neg_inactive_real_count[i];
			const double neg_ratio = (num_neg_voting>0 ?
				neg_non_zero_real_count[i]/(double)num_neg_voting : 0.0);

			cout << i <<  "\t";
			cout << pos_non_zero_real_count[i] << "\t" << pos_non_zero_real_count[i]/(double)num_pos_voting << "\t";
			cout << neg_non_zero_real_count[i] << "\t" << neg_ratio << "\t";
			cout << real_feature_names[i] << endl;
			if (pos_real_vals[i].size()>20)
			{
				// 10 evenly spaced values from the sorted list
				cout << "POS: ";
				sort(pos_real_vals[i].begin(),pos_real_vals[i].end());
				const int s_inc = pos_real_vals[i].size()/10;
				int k;
				for (k=0; k<10; k++)
					cout << pos_real_vals[i][k*s_inc] << " ";
				cout << endl;
			}
		}
		else
			cout << i<< " not enough pos" << endl;

		if (enough_neg)
		{
			if (neg_real_vals[i].size()>20)
			{
				cout << "NEG: ";
				sort(neg_real_vals[i].begin(),neg_real_vals[i].end());
				const int s_inc = neg_real_vals[i].size()/10;
				int k;
				for (k=0; k<10; k++)
					cout << neg_real_vals[i][k*s_inc] << " ";
				cout << endl;
			}
		}
		else
			cout << "Not enough neg" << endl;
		cout << endl;
	}
	cout << endl;
}



/******************************************************************
Chooses what values to use as limits for the real valued features.
If there is only one dominant value, or the number of samples for
which the feature is active is too small, then the feature gets
deactivated.
*******************************************************************/
void RankBoostModel::set_real_limits(const vector<RankBoostSample>& samples,
									 int min_num_samples_for_feature,
									 int max_num_bins,
									 bool clear_weights)
{
	const int num_samples = samples.size();
	vector< vector<float> > real_vals;

	real_vals.clear();
	real_limits.clear();

	real_vals.resize(num_real_features);
	real_limits.resize(num_real_features);

	int i;
	for (i=0; i<num_real_features; i++)
		real_vals[i].clear();

	for (i=0; i<num_samples; i++)
	{
		const RankBoostSample& sam = samples[i];

		int j;
	
		for (j=0; j<sam.real_active_idxs.size(); j++)
		{
			const int f_idx = sam.real_active_idxs[j];
			real_vals[f_idx].push_back(sam.real_active_values[j]);
		}
	}


	for (i=0; i<num_real_features; i++)
	{
		real_limits[i].clear();
		if (real_vals.size() <min_num_samples_for_feature )
			continue;

		sort(real_vals[i].begin(),real_vals[i].end());

		// find number of unique values. If it is below max_num_bins, then 
		// make sure there are at least min_bin_size samples per bin. If there are
		// more than that number, split into equal sized bins

		vector<float> unique_vals;
		vector<int>   counts;
		const int num_real_vals = real_vals[i].size();

		int j;
		for (j=0; j<num_real_vals; j++)
		{
			const float val = real_vals[i][j];
			int k;
			for (k=0; k<unique_vals.size(); k++)
				if (val == unique_vals[k])
				{
					counts[k]++;
					break;
				}

			if (k<unique_vals.size())
				continue;
			
			unique_vals.push_back(val);
			counts.push_back(1);

			if (unique_vals.size()>max_num_bins)
				break;
		}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -