// rankboost.cpp
{
cout << "Error: coudln't open out_file for wrtiting: " << buff << endl;
exit(1);
}
sprintf(buff,"%s_stats.txt",report_prefix);
stat_file_stream.open(buff,ios::out);
if (! stat_file_stream.is_open() || ! stat_file_stream.good())
{
cout << "Error: coudln't open test stats for wrtiting: " << buff << endl;
exit(1);
}
sprintf(buff,"%s_train_res.txt",report_prefix);
train_res.open(buff,ios::out);
if (! train_res.is_open() || ! train_res.good())
{
cout << "Error: coudln't open train res for wrtiing: " << buff << endl;
exit(1);
}
if (test_ds)
{
sprintf(buff,"%s_test_res.txt",report_prefix);
test_res.open(buff,ios::out);
if (! test_res.is_open() || ! test_res.good())
{
cout << "Error: coudln't open test res for writing: " << buff << endl;
exit(1);
}
}
use_cout = false;
}
ostream& out_stream = (use_cout ? cout : out_file_stream);
ostream& stat_stream = (use_cout ? cout : stat_file_stream);
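// D0 is the initial weight distribution over the ranked training pairs and D is the
// working distribution that boosting re-weights each round. When the dataset caps the
// ratio allowed for a "regular" update (the 9999 value below acts as a "no cap"
// sentinel), max_D_for_normal_updates presumably holds per-pair weight ceilings that
// the distribution updates are clipped against.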
vector<weight_t> D,D0, max_D_for_normal_updates;
vector<weight_t> *p_max_D_for_normal_updates = NULL;
training_ds.compute_initial_distribution(D0);
if (training_ds.get_max_ratio_for_regular_update()<9999)
{
training_ds.compute_max_weights_for_normal_update(D0,max_D_for_normal_updates);
p_max_D_for_normal_updates = &max_D_for_normal_updates;
}
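// bookkeeping for the feature-progress reports below: the round at which each real
// feature was first selected, plus per-feature selection counts (presumably
// incremented by the model-update routines)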
real_first_updates.resize(num_real_features,NEG_INF);
real_update_counts.resize(num_real_features,0);
binary_update_counts.resize(num_binary_features,0);
D=D0;
double Z_prod = 1.0;
total_default_weight = 0;
best_test_error = 1.0;
best_train_error = 1.0;
this->ind_was_initialized = true;
time_t start_time = time(NULL);
cout << "Running boosting for at most " << max_num_rounds << " iterations..." << endl;
int t;
for (t=1; t<=max_num_rounds; t++)
{
current_round = t;
// determine how often we report progress and evaluate the training/test error
int report_freq = 1;
if (t>10) report_freq=10;
if (t>500) report_freq=50;
if (t>1000) report_freq=100;
if (t>5000) report_freq=500;
if (t>10000) report_freq=1000;
if (t>100000) report_freq=5000;
if (t>500000) report_freq=10000;
const int feature_report_rounds = 10000;
bool report_this_round = ((t % report_freq) == 0);
bool use_double_theta = true;
bool ind_only_non_zero_features = false;
if (t>100 && t%5) ind_only_non_zero_features = true;
if (t>1000 && t%10) ind_only_non_zero_features = true;
if (t>10000 && t%25) ind_only_non_zero_features = true;
if (t>100000 && t%100) ind_only_non_zero_features = true;
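// in later rounds, mostly restrict the weak-learner search to features that already
// have non-zero weight in the model; a full scan over all candidate features happens
// only periodically (every 5th round once t>100, and progressively less often as t grows)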
int best_binary_idx=-1;
double best_binary_r = 0;
int real_feature_idx=-1;
int real_theta_start_idx = -1;
int real_theta_end_idx = -1;
int real_q_def = -1;
double real_r = 0;
vector<weight_t> potentials;
training_ds.calc_potentials(D,potentials);
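// In the standard RankBoost formulation the potential of a sample is the total weight
// of pairs in which it should be ranked higher minus the total weight of pairs in which
// it should be ranked lower; the weak learners below score candidate features against
// these potentials instead of iterating over all pairs (calc_potentials is assumed to
// follow that formulation).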
if (num_binary_features>0)
binary_weak_learn(potentials,training_ds,best_binary_idx,best_binary_r);
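// the "double theta" variant looks for a real-valued weak ranker that fires on a value
// range [theta_start, theta_end) rather than above a single threshold, with q_def as
// the ranker's default output for samples lacking the feature (an assumption based on
// how the returned indices are used below)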
if (num_real_features>0)
{
if (use_double_theta)
{
real_weak_learn_double_theta(potentials, training_ds, real_feature_idx,
real_theta_start_idx, real_theta_end_idx,
real_q_def, real_r, ind_only_non_zero_features );
}
else
real_weak_learn(potentials, training_ds, real_feature_idx,
real_theta_start_idx, real_q_def, real_r, ind_only_non_zero_features );
}
if (real_r == 0 && best_binary_r == 0)
{
if (t<5)
{
cout << "Error: model converged to quickly, there is a problem with the feature values!" << endl;
exit(1);
}
break;
}
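// r is the chosen weak ranker's correlation with the current pair weights; setting
// alpha = 0.5 * ln((1+r)/(1-r)) is the RankBoost choice that (approximately) minimizes
// the normalization factor Z of the re-weighted distribution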
double Z=1.0;
if (fabs(real_r)>fabs(best_binary_r))
{
const weight_t alpha = 0.5 * log((1+real_r)/(1-real_r));
const float theta_start = real_limits[real_feature_idx][real_theta_start_idx-1];
const float theta_end = ( (real_theta_end_idx>0 &&
real_theta_end_idx<real_limits[real_feature_idx].size())
? real_limits[real_feature_idx][real_theta_end_idx] :
POS_INF);
Z=training_ds.update_distribution_according_to_real_feature(real_feature_idx,
theta_start, theta_end, real_q_def,alpha, D , p_max_D_for_normal_updates, false);
update_model_weights_for_real_feature(alpha, real_feature_idx, real_q_def,
real_theta_start_idx, real_theta_end_idx);
if (real_first_updates[real_feature_idx]<0)
real_first_updates[real_feature_idx]=t;
}
else
{
const weight_t alpha = 0.5 * log((1+best_binary_r)/(1-best_binary_r));
Z=training_ds.update_dsitribution_according_to_binary_feature(best_binary_idx, alpha, D);
update_model_weights_for_binary_feature(best_binary_idx, alpha);
}
Z_prod *= Z;
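// in RankBoost the product of the per-round normalizers upper-bounds the weighted
// ranking loss on the training pairs, so Z_prod is reported below as a cheap
// convergence diagnostic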
// This part is for the feature reports; it is not really part of the training
if (running_feature_report_idx>=0 &&
real_feature_idx == running_feature_report_idx)
{
const int num_updates = real_update_counts[running_feature_report_idx];
int sqr=1;
while (sqr<num_updates)
sqr*=2;
if (sqr == num_updates ) // output only for 1,2,4,8,... updates
{
if (! flist_stream.is_open())
{
char buff[512];
sprintf(buff,"%s_fprog_%d.txt",report_prefix,running_feature_report_idx);
flist_stream.open(buff,ios::out);
if (! flist_stream.is_open() || ! flist_stream.good())
{
cout << "Error: coudln't open flist for writing: " << buff << endl;
exit(1);
}
}
ouput_importance_ranked_feature_list(training_ds,flist_stream,running_feature_report_idx,t);
cout << t << "\t" << "feature report: " << num_updates << endl;
}
}
// all reporting and error evaluation for this round happens in this block
if (report_this_round)
{
time_t current_time = time(NULL);
double total_time = current_time - start_time;
clock_t start_test = clock();
int num_tested_peptides_in_train=0;
int num_tested_peptides_in_test=0;
int *ptr_train_num = (t<=1 ? &num_tested_peptides_in_train : NULL);
int *ptr_test_num = (t<=1 ? &num_tested_peptides_in_test : NULL);
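// the tested-peptide counts are only collected on the first round (t<=1); later
// report rounds pass NULL and skip them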
vector<peak_rank_stat> train_peak_stats, test_peak_stats;
train_error = calc_prediction_error(training_ds, train_peak_stats,
test_tag3_filter_val, ptr_train_num);
test_error = 1.0;
if (test_ds)
test_error = calc_prediction_error(*test_ds, test_peak_stats,
test_tag3_filter_val, ptr_test_num);
cout << "Round\t" << t << "\t#rf " << non_zero_real_idxs.size() << "\ttime: " << total_time << " secs.\t" << setprecision(5) << "[ " <<
setprecision(5) << train_error << " " << test_error << "]" << endl;
// these should only be output for the first round
if (ptr_train_num)
{
out_stream << "ERRORS MEASURED FOR TAG3 VAL " << test_tag3_filter_val << endl;
out_stream << "TRAIN ERRORS FROM " << *ptr_train_num << endl;
if (report_prefix)
{
stat_stream << "ERRORS MEASURED FOR TAG3 VAL " << test_tag3_filter_val << endl;
stat_stream << "TRAIN ERRORS FROM " << *ptr_train_num << endl;
}
}
if (ptr_test_num)
{
out_stream << "TEST ERRORS FROM " << *ptr_test_num << endl;
if (report_prefix)
stat_stream << "TEST ERRORS FROM " << *ptr_test_num << endl;
}
out_stream << setprecision(7);
out_stream << "Round " << t << "\t" << setprecision(7) << fixed << (int)total_time << "\t";
if (num_binary_features>0)
out_stream << "Act bin " << non_zero_binary_idxs.size() << "/" << num_binary_features ;
if (num_real_features>0)
out_stream << " Act real " << non_zero_real_idxs.size() << "/" << num_real_features << endl;
if (num_binary_features>0)
{
out_stream << "Best BINARY feature: " << best_binary_idx << " " << binary_feature_names[best_binary_idx] <<
" r: " << best_binary_r << endl;
}
if (num_real_features>0)
{
out_stream << "Best REAL feature : " << real_feature_idx << " " << real_feature_names[real_feature_idx] <<
" theta: " << real_theta_start_idx << "-" <<
real_theta_end_idx << " r: " << real_r << endl;
}
clock_t end_test = clock();
double test_time = (end_test-start_test)/(double)CLOCKS_PER_SEC;
out_stream << setprecision(6);
out_stream << "train: " << train_error;
if (test_ds)
out_stream << "\ttest: " << test_error;
out_stream << "\tZ_prod = " << Z_prod << "\t" << "(" <<
test_time << ")" << endl;
out_stream << endl;
// full stats
if (report_prefix)
{
stat_stream << fixed << setprecision(6);
stat_stream << t << "\t" << (int)total_time << "\t" << non_zero_binary_idxs.size() <<
"\t" << non_zero_real_idxs.size() << "\t" << train_error << "\t";
int stat_size = train_peak_stats.size();
if (test_ds)
{
stat_stream << test_error << "\t";
if (test_peak_stats.size()<train_peak_stats.size())
stat_size = test_peak_stats.size();
}
stat_stream << endl;
// echo a condensed summary line to the main output stream as well
out_stream << setprecision(4) << t << "\t" << fixed << (int)total_time << "\t" <<
non_zero_binary_idxs.size() << "\t" << non_zero_real_idxs.size() <<
"\t" << train_error << "\t";
if (test_ds)
out_stream << test_error << "\t";
out_stream << endl << endl;
// peak stats
train_res << t <<"\t" << setprecision(6) << train_error << "\t" << stat_size;
int j;
for (j=0; j<stat_size; j++)
train_res << "\t" << setprecision(4) << train_peak_stats[j].precent_predicted_correctly;
for (j=0; j<stat_size; j++)
train_res << "\t" << setprecision(4) << train_peak_stats[j].avg_precited_rank + 1.0 << "\t"
<< setprecision(5) << train_peak_stats[j].sd_predicted_rank;
train_res << endl;
if (test_ds)
{
test_res << t <<"\t" << setprecision(6) << test_error << "\t" << stat_size;
int j;
for (j=0; j<stat_size; j++)
test_res << "\t" << setprecision(4) << test_peak_stats[j].precent_predicted_correctly;
for (j=0; j<stat_size; j++)
test_res << "\t" << setprecision(4) << test_peak_stats[j].avg_precited_rank + 1.0 << "\t"
<< setprecision(5) << test_peak_stats[j].sd_predicted_rank;
test_res << endl;
}
if (t % 100 == 0)
{
stat_stream.flush();
out_stream.flush();
}
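// note: this diagnostic is disabled by the leading "0 &&"; flip it to dump the most
// misclassified training pairs during the run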
if (0 && top_misclassified_pairs)
{
get_top_misclassified_pairs(training_ds,D,D0,*top_misclassified_pairs);
print_top_misclassified_pairs(training_ds,D,D0,10,out_stream);
}
}
// if this checkpoint round has the best test error so far, record its parameters
// and periodically write the current model and feature list to disk
if (test_error<best_test_error && ((t % 100 == 0) || (t<100 && t % 10 == 0)))
{
set_best_model_parameters_to_current_parameters();
// write the model; since compressing it messes up the live model, copy the model
// and write the copy instead
if (t % 100 == 0 && report_prefix)
{
RankBoostModel copy_of_model = *this;
char name_buff[512];
sprintf(name_buff,"%s_model.txt",report_prefix);
ofstream model_stream(name_buff);
if (! model_stream.is_open() || ! model_stream.good())
{
cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
exit(1);
}
if (model_header_strings)
{
int i;
for (i=0; i<model_header_strings->size(); i++)
model_stream << model_header_strings->at(i);
}
copy_of_model.write_rankboost_model(model_stream);
model_stream.close();
sprintf(name_buff,"%s_feature_list.txt",report_prefix);
ofstream feature_stream(name_buff);
if (! feature_stream.is_open() || ! feature_stream.good())
{
cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
exit(1);
}
copy_of_model.ouput_importance_ranked_feature_list(training_ds,feature_stream);
feature_stream.close();
}
}
// early stopping: if the test error has not improved since round t/stop_ratio
// (i.e., for a large fraction of the recent rounds), we can stop
if (t>100)
{
// laxer stopping ratio early in the run, tighter as t grows
double stop_ratio = (t>100000 ? 1.3 : 2.0);
if (t<1000)
stop_ratio = 3.0;
else if (t<20000)
stop_ratio = 2.5;
if (max_num_rounds<1000000 && // if it is 1000000 then let it run...
t/(double)best_round_idx>=stop_ratio &&
best_test_error<test_error)
{
out_stream << "TERMINATING AT ROUND " << t << ", NO PROGRESS IN TEST ERROR SINCE ROUND " <<
best_round_idx << endl << endl;
out_stream << fixed << setprecision(6) << "Current test error: " << test_error << ", best test error " << best_test_error << endl;
break;
}
}
// check signal
if (stop_signal_file)
{
ifstream signal_stream(stop_signal_file);
if (signal_stream.is_open())
{
out_stream << endl << "TERMINATED BECAUSE STOP FILE WAS DETECTED!" << endl;
out_stream << "( " << stop_signal_file << " )" << endl;
break;
}
}
}
}
set_current_model_parameters_to_best_parameters();
vector<peak_rank_stat> dummy_stats;
train_error = calc_prediction_error(training_ds, dummy_stats, test_tag3_filter_val);
test_error = 1.0;
if (test_ds)
test_error = calc_prediction_error(*test_ds, dummy_stats, test_tag3_filter_val);
out_stream << setprecision(6) << fixed << "FINAL ERRORS:" << endl;
out_stream << "train:\t" << train_error << endl;
out_stream << "test:\t" << test_error << endl;
if (top_misclassified_pairs)
get_top_misclassified_pairs(training_ds,D,D0,*top_misclassified_pairs);
if (out_file_stream.is_open())
out_file_stream.close();
if (stat_file_stream.is_open())
stat_file_stream.close();
if (train_res.is_open())
train_res.close();
if (test_res.is_open())
test_res.close();
if (flist_stream.is_open())
flist_stream.close();
ind_was_initialized = true;
cout << "Finished training boost model (" << t-1 << " rounds)" << endl;