// rankboost.cpp
// The end bin size and mid bin size are the sizes that can be given to bins
// of real values. The end bins are made larger to reduce the incidence of
// overfitting to the extreme values.
int bin_size = (min_num_samples_for_feature/2)+1;
if (num_real_vals/max_num_bins>bin_size)
bin_size = num_real_vals/max_num_bins;
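// Illustrative numeric sketch (hypothetical values, not from the original
// code): with min_num_samples_for_feature = 20, num_real_vals = 1000 and
// max_num_bins = 50, the first rule gives bin_size = 20/2+1 = 11, but
// num_real_vals/max_num_bins = 1000/50 = 20 > 11, so bin_size becomes 20,
// i.e., each interior bin must cover at least 20 sorted feature values.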
real_limits[i].push_back(NEG_INF);
// go according to unique values
if (j==num_real_vals)
{
int k;
for (k=0; k<unique_vals.size(); k++)
{
if (k<unique_vals.size()-1 &&
counts[k]<bin_size)
{
counts[k+1]+=counts[k];
continue;
}
if (counts[k]>=bin_size)
real_limits[i].push_back(unique_vals[k]);
}
if (real_limits[i].size()>1)
real_limits[i].pop_back(); // drop the last limit so the top bin also covers all larger values
}
else
{
if (num_real_vals > min_num_samples_for_feature)
{
const int max_idx = num_real_vals - bin_size;
int idx = bin_size;
float last_limit = NEG_INF;
while (idx<max_idx)
{
float new_limit = real_vals[i][idx];
if (new_limit != last_limit)
{
real_limits[i].push_back(new_limit);
last_limit = new_limit;
}
idx+=bin_size;
}
if (real_limits[i].size()>1)
real_limits[i].pop_back(); // drop the last limit so the top bin also covers all larger values
}
}
}
if (clear_weights)
{
for (i=0; i<num_real_features; i++)
{
real_weights[i].clear();
real_weights[i].resize(real_limits[i].size()+1,0);
}
}
cout << "Set real limits for " << num_real_features << " real features..." << endl;
}
/************************************************************************
Chooses which features should be considered active (i.e., have their weights
updated in the boosting rounds). A binary feature stays active only if both
of its values (0 and 1) are observed in at least min_sample_count samples.
*************************************************************************/
void RankBoostModel::select_active_features(const RankBoostDataset& training_ds,
int min_sample_count)
{
ind_active_binary_feature.resize(num_binary_features,true);
ind_active_real_feature.resize(num_real_features,true);
const vector<RankBoostSample>& samples = training_ds.get_samples();
const int num_samples = samples.size();
vector<int> binary_one_counts, binary_zero_counts, binary_inactive_counts;
if (real_limits.size() != real_feature_names.size())
{
cout << "Error: must first set real feature limits!" << endl;
exit(1);
}
binary_one_counts.resize(num_binary_features,0);
binary_zero_counts.resize(num_binary_features,0);
binary_inactive_counts.resize(num_binary_features,0);
int i;
for (i=0; i<num_samples; i++)
{
const RankBoostSample& sam = samples[i];
vector<bool> used_inds;
used_inds.resize(num_binary_features,false);
int j;
for (j=0; j< sam.binary_novote_idxs.size(); j++)
{
binary_inactive_counts[sam.binary_novote_idxs[j]]++;
used_inds[sam.binary_novote_idxs[j]]=true;
}
for (j=0; j<sam.binary_non_zero_idxs.size(); j++)
{
binary_one_counts[sam.binary_non_zero_idxs[j]]++;
used_inds[sam.binary_non_zero_idxs[j]]=true;
}
for (j=0; j<num_binary_features; j++)
if (! used_inds[j])
binary_zero_counts[j]++;
}
// inactivate binary features
for (i=0; i<num_binary_features; i++)
{
if (binary_one_counts[i]<min_sample_count ||
binary_zero_counts[i]<min_sample_count)
{
ind_active_binary_feature[i]=false;
cout << "Inactivating BINARY feature " << i << " " << this->binary_feature_names[i];
cout << " (counts 0: " << binary_zero_counts[i] << " 1: " <<
binary_one_counts[i] << " nv: " << binary_inactive_counts[i] << ")" << endl;
}
}
// inactivate real features with no ranges
for (i=0; i<num_real_features; i++)
{
if (real_limits[i].size()==0)
{
ind_active_real_feature[i]=false;
cout << "Inactivating REAL feature " << i << " " << real_feature_names[i] << endl;
}
}
}
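// Illustrative sketch (not part of the original model): the activity rule
// used above for binary features, pulled out as a standalone helper so it can
// be tested in isolation. The function name is hypothetical; it only assumes
// the per-feature 0/1 counts that select_active_features() already gathers,
// plus the <vector> / using declarations already present in this file.
static vector<bool> active_binary_flags_sketch(const vector<int>& one_counts,
						const vector<int>& zero_counts,
						int min_sample_count)
{
	vector<bool> active(one_counts.size(), true);
	for (size_t i = 0; i < one_counts.size(); i++)
		// a feature stays active only if both values were observed
		// at least min_sample_count times, mirroring the test above
		if (one_counts[i] < min_sample_count || zero_counts[i] < min_sample_count)
			active[i] = false;
	return active;
}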
/************************************************************************
Performs the weak learn function on the binary features.
Assumes that q_def is always 0 (so its selection is ignored).
The function assumes that the potentials have already been computed.
The function examines all features and returns the one for which |r| is
the largest.
*************************************************************************/
void RankBoostModel::binary_weak_learn(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
double& best_r_value,
bool only_non_zero_features) const
{
best_feature_idx=-1;
best_r_value = 0;
if (only_non_zero_features && non_zero_binary_idxs.size() == 0)
return;
int nz_idx=0;
int i;
for (i=0; i<num_binary_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_binary_idxs.size())
return;
// jump directly to the next feature that has a non-zero weight
// (mirroring the real-valued learner below); advancing nz_idx
// unconditionally keeps the scan aligned with the sorted index list
i=non_zero_binary_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_binary_feature[i])
continue;
const vector<int>& active_idxs = rank_ds.get_binary_one_list(i);
double r=0;
int j;
for (j=0; j<active_idxs.size(); j++)
r+=potentials[active_idxs[j]];
if (fabs(r) > fabs(best_r_value))
{
best_feature_idx=i;
best_r_value = r;
}
}
}
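// Illustrative sketch (not part of the original model): the r-value that
// binary_weak_learn() maximizes is just the sum of the pair potentials over
// the samples on which the feature fires. A standalone version of that inner
// loop on plain doubles (the function name is hypothetical):
static double binary_r_value_sketch(const vector<double>& potentials,
						const vector<int>& one_idxs)
{
	double r = 0.0;
	for (size_t j = 0; j < one_idxs.size(); j++)
		r += potentials[one_idxs[j]];	// only samples with feature value 1 contribute
	return r;				// the learner keeps the feature with the largest |r|
}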
/************************************************************************
Performs the weak learn function on the real-valued features.
The function assumes that the potentials have already been computed.
It finds the feature and theta threshold for which |r| is maximized.
It also returns the q_def (0 or 1) which should be given to samples
that did not vote on this feature (if q_def is 1, then the default weights
for this feature should be updated accordingly).
*************************************************************************/
void RankBoostModel::real_weak_learn(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
int& theta_idx,
int& q_def,
double& best_r_value,
bool only_non_zero_features) const
{
const int num_samples = rank_ds.get_num_samples();
best_feature_idx=0;
theta_idx=-1;
q_def=-1;
best_r_value=0;
int nz_idx =0;
int i;
for (i=0; i<num_real_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_real_idxs.size())
return;
// jump directly to the next feature that has a non-zero weight
i=non_zero_real_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_real_feature[i])
continue;
const int num_thetas = real_limits[i].size()+1;
double R=0;
vector<double> L_bin_values;
L_bin_values.resize(num_thetas,0);
const vector< vector<int> >& lists = rank_ds.get_real_vote_lists(i);
int j;
for (j=0; j<lists.size(); j++)
{
const vector<int>& bin_list = lists[j];
int k;
for (k=0; k<bin_list.size(); k++)
L_bin_values[j]+=potentials[bin_list[k]];
R+=L_bin_values[j];
}
// The values in bin j are all greater than the limit at index j-1,
// so we return theta index j and update all weights in positions j and up.
// The first bin (j=0) would hold values below -infinity, so it is always
// empty and is never considered.
double L=0;
for (j=L_bin_values.size()-1; j>0; j--)
{
int q=0; // default for novote
L+=L_bin_values[j];
if (fabs(L)<=fabs(L-R))
q=1;
if (fabs(L-(double)q*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L-(double)q*R;
theta_idx = j;
q_def = q;
/* if (fabs(best_r_value)>1)
{
cout << "Feature: " << i << "," << j << " best_r_value: " << best_r_value << endl;
int k;
for (k=0; k<real_limits[i].size(); k++)
cout << real_limits[i][k] << "\t";
cout << endl;
for (k=0; k<L_bin_values.size(); k++)
cout << L_bin_values[k] << "\t";
cout << endl;
cout << endl << endl;
}*/
}
}
}
}
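// Illustrative sketch (not part of the original model): for one real feature,
// real_weak_learn() scans candidate thresholds from the highest bin downward.
// L is the running sum of bin potentials above the candidate theta, R is the
// total over all bins, and the score is r = L - q*R with q chosen as 0 or 1,
// whichever makes |r| larger. A standalone version of that scan on plain
// doubles (the function name is hypothetical):
static double best_threshold_score_sketch(const vector<double>& bin_potentials,
						int& best_theta_idx, int& best_q)
{
	double R = 0.0;
	size_t j;
	for (j = 0; j < bin_potentials.size(); j++)
		R += bin_potentials[j];			// total potential over all bins
	double best_r = 0.0, L = 0.0;
	best_theta_idx = -1;
	best_q = -1;
	int b;
	for (b = (int)bin_potentials.size() - 1; b > 0; b--)	// bin 0 ("below -infinity") is always empty
	{
		L += bin_potentials[b];
		const int q = (fabs(L) <= fabs(L - R)) ? 1 : 0;	// q_def for samples that did not vote
		const double r = L - (double)q * R;
		if (fabs(r) > fabs(best_r))
		{
			best_r = r;
			best_theta_idx = b;
			best_q = q;
		}
	}
	return best_r;
}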
/************************************************************************
Performs the weak learn function on the real-valued features, using a pair
of thresholds (a start bin and an end bin) rather than a single theta.
The function assumes that the potentials have already been computed.
It finds the feature and the bin range [theta_start, theta_end] for which
|r| is maximized. It also returns the q_def (0 or 1) which should be given
to samples that did not vote on this feature (if q_def is 1, then the
default weights for this feature should be updated accordingly).
*************************************************************************/
void RankBoostModel::real_weak_learn_double_theta(const vector<weight_t>& potentials,
const RankBoostDataset& rank_ds,
int& best_feature_idx,
int& theta_start_idx,
int& theta_end_idx,
int& q_def,
double& best_r_value,
bool only_non_zero_features) const
{
const int num_samples = rank_ds.get_num_samples();
best_feature_idx=0;
theta_start_idx=-1;
theta_end_idx =-1;
q_def=-1;
best_r_value=0;
int nz_idx =0;
int i;
for (i=0; i<num_real_features; i++)
{
if (only_non_zero_features)
{
if (nz_idx==non_zero_real_idxs.size())
return;
i=non_zero_real_idxs[nz_idx];
nz_idx++;
}
if (! ind_active_real_feature[i])
continue;
const int num_thetas = real_limits[i].size()+1;
const int last_bin_idx = num_thetas - 1;
double R=0;
vector<double> L_bin_values;
L_bin_values.resize(num_thetas,0);
const vector< vector<int> >& lists = rank_ds.get_real_vote_lists(i);
int j;
for (j=0; j<lists.size(); j++)
{
const vector<int>& bin_list = lists[j];
int k;
for (k=0; k<bin_list.size(); k++)
L_bin_values[j]+=potentials[bin_list[k]];
R+=L_bin_values[j];
}
// The values in bin j are all greater than the limit at index j-1,
// so we return theta index j and update all weights in positions j and up.
// The first bin (j=0) would hold values below -infinity, so it is always
// empty and is never considered.
double L=0;
for (j=last_bin_idx; j>0; j--)
{
int q=0; // default for novote
L+=L_bin_values[j];
if (fabs(L)<=fabs(L-R))
q=1;
if (fabs(L-(double)q*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L-(double)q*R;
theta_start_idx = j;
theta_end_idx = last_bin_idx;
q_def = q;
}
double L_prime = L;
int k;
for (k=last_bin_idx; k>j; k--)
{
L_prime -= L_bin_values[k];
// choose q for this candidate band from scratch rather than carrying
// over a value set for a previous candidate
int q_prime = (fabs(L_prime)<=fabs(L_prime-R)) ? 1 : 0;
if (fabs(L_prime-(double)q_prime*R)>fabs(best_r_value))
{
best_feature_idx = i;
best_r_value = L_prime-(double)q_prime*R;
theta_start_idx = j;
theta_end_idx = k-1;
q_def = q_prime;
}
}
}
}
}
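// Illustrative sketch (not part of the original model): the double-theta
// learner above scores a closed band of bins [start, end] instead of a single
// open-ended threshold. L' is the sum of the bin potentials inside the band
// and the score is again r = L' - q*R. A standalone scorer for one candidate
// band on plain doubles (the function name is hypothetical):
static double band_score_sketch(const vector<double>& bin_potentials,
						int start_bin, int end_bin, int& q_out)
{
	double R = 0.0, L_prime = 0.0;
	size_t j;
	for (j = 0; j < bin_potentials.size(); j++)
		R += bin_potentials[j];			// total potential over all bins
	int b;
	for (b = start_bin; b <= end_bin; b++)
		L_prime += bin_potentials[b];		// potential of samples falling inside the band
	q_out = (fabs(L_prime) <= fabs(L_prime - R)) ? 1 : 0;	// default for samples that did not vote
	return L_prime - (double)q_out * R;		// the learner keeps the band with the largest |r|
}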
/************************************************************************
Performs the training of the rank boost model.
This function assumes that the training_ds was already initialized (has
phi_support computed already), and that the model has been initialized
for this dataset (feature names set, active features selected).
The algorithm performs a designated number of rounds (where each round is
considered an update of all active features).
If a test set is given, the test error is reported for each round.
*************************************************************************/
bool RankBoostModel::train_rankboost_model(
const RankBoostDataset& training_ds,
int max_num_rounds,
vector<idx_weight_pair>* top_misclassified_pairs,
RankBoostDataset* test_ds,
int test_tag3_filter_val,
char *report_prefix,
char *stop_signal_file,
const vector<string>* model_header_strings) // these appear at the top of the boost model file
// in case there is extra info not in the ordinary
// boost models.
{
fstream out_file_stream, stat_file_stream, train_res, test_res, flist_stream;
bool use_cout = true;
int running_feature_report_idx = -1; // if set to a feature idx, will give a report on the feature's progress
// open streams for reporting
if (report_prefix)
{
char buff[512];
sprintf(buff,"%s_progress.txt",report_prefix);
out_file_stream.open(buff,ios::out);
if (! out_file_stream.is_open() || ! out_file_stream.good())