📄 rankboost.cpp
字号:
#include "RankBoost.h"
#include "auxfun.h"
bool RankBoostModel::read_rankboost_model(istream& is)
{
num_binary_features = 0;
num_real_features =0;
int num_non_zero_real_features=0;
is >> num_binary_features;
if (num_binary_features>0)
{
binary_feature_names.resize(num_binary_features);
binary_weights.resize(num_binary_features,0);
ind_active_binary_feature.resize(num_binary_features,true);
int i;
for (i=0; i<num_binary_features; i++)
{
int idx=-1;
is >> idx >> binary_weights[i] >> binary_feature_names[i];
if (idx<0 || binary_weights[i] == 0)
{
cout << "Error: reading binary weight line " << i << endl;
exit(1);
}
}
}
is >> num_real_features;
is >> num_non_zero_real_features;
if (num_real_features>0)
{
real_feature_names.resize(num_real_features);
real_default_weights.resize(num_real_features,0);
real_weights.resize(num_real_features);
real_limits.resize(num_real_features);
ind_active_real_feature.resize(num_real_features,false);
non_zero_real_idxs.clear();
int i;
for (i=0; i<num_non_zero_real_features; i++)
{
int num_weights=0;
int num_limits=0;
int j;
char buff[1024];
is.getline(buff,1024);
is.getline(buff,1024);
int feature_idx;
double weight;
if (sscanf(buff,"%d\t%lf",&feature_idx,&weight) != 2)
{
cout << "Error: bad line in model file:" << endl << buff << endl;
exit(1);
}
real_default_weights[feature_idx]=weight;
const int len = strlen(buff);
int tab_count=0;
for (j=0; j<len && tab_count<2; j++)
if (buff[j]=='\t')
tab_count++;
if (tab_count<2)
{
cout << "Error: bad tab count!" << endl;
exit(1);
}
real_feature_names[feature_idx] = string(buff+j);
ind_active_real_feature[feature_idx] = true;
non_zero_real_idxs.push_back(feature_idx);
string &fn = real_feature_names[feature_idx];
is >> num_limits;
real_limits[feature_idx].resize(num_limits,NEG_INF);
for (j=0; j<num_limits; j++)
is >> real_limits[feature_idx][j];
is >> num_weights;
real_weights[feature_idx].resize(num_weights,0);
for (j=0; j<num_weights; j++)
is >> real_weights[feature_idx][j];
}
// this last part is done to move to the next line (so the next getline gets a new line)
char buff[16];
is.getline(buff,10);
if (strlen(buff)>1)
{
cout << "Error: bad parsing of RankBoostModel!" << endl;
exit(1);
}
}
sort(non_zero_real_idxs.begin(),non_zero_real_idxs.end());
this->total_default_weight=0;
int i;
for (i=0; i<num_real_features; i++)
total_default_weight+=real_default_weights[i];
ind_was_initialized=true;
return true;
}
void RankBoostModel::write_rankboost_model(ostream& os, bool ind_compress)
{
// compresses the weights and limits before writing
if (ind_compress)
compress_real_limits_and_weights();
if (! ind_was_initialized)
{
cout << "Error: trying to write an unitialized RankBoost model!" << endl;
exit(1);
}
int i;
int num_non_zero_bin = 0;
for (i=0; i<num_binary_features; i++)
if (binary_weights[i] != 0)
num_non_zero_bin++;
os << num_non_zero_bin << endl;
os << setprecision(8);
for (i=0; i<num_binary_features; i++)
if (binary_weights[i] != 0)
os << i << "\t" << binary_weights[i] << "\t" << binary_feature_names[i] << endl;
// assumes the limits and weights were already compressed, otherwise the model
// will be very large
int num_non_zero_real = 0;
for (i=0; i<num_real_features; i++)
if (real_weights[i].size()>0)
{
num_non_zero_real++;
}
os << num_real_features << endl;
os << num_non_zero_real << endl;
for (i=0; i<num_real_features; i++)
{
if ( real_weights[i].size() == 0 )
continue;
os << i << "\t" << real_default_weights[i] << "\t" << real_feature_names[i] << endl;
os << real_limits[i].size();
int j;
for (j=0; j<real_limits[i].size(); j++)
os << "\t" << real_limits[i][j];
os << endl;
os << real_weights[i].size();
for (j=0; j<real_weights[i].size(); j++)
os << "\t" << real_weights[i][j];
os << endl;
}
}
/***********************************************************************
// compresses the number of limits and weights to represent steps in the weight
// function without repeatition of the same weights in different limits
************************************************************************/
void RankBoostModel::compress_real_limits_and_weights()
{
int i;
for (i=0; i<num_real_features; i++)
{
if (real_limits[i].size() < 1)
{
real_weights[i].clear();
real_limits[i].clear();
continue;
}
else
{
int j;
if (real_default_weights[i]==0)
{
for (j=0; j<real_weights[i].size(); j++)
if (real_weights[i][j] != 0)
break;
if (j== real_weights[i].size())
{
real_weights[i].clear();
real_limits[i].clear();
continue;
}
}
}
vector<float> new_limits;
vector<weight_t> new_weights;
new_weights.push_back(real_weights[i][0]);
new_limits.push_back(real_limits[i][0]);
int j;
for (j=1; j<real_limits[i].size(); j++)
{
if (real_weights[i][j] != real_weights[i][j+1])
{
new_weights.push_back(real_weights[i][j]);
new_limits.push_back(real_limits[i][j]);
}
}
new_weights.push_back(real_weights[i][j]); // last weight is for values larger than the last limit
real_limits[i]=new_limits;
real_weights[i]=new_weights;
}
remove_default_weights();
}
/*************************************************************************
Removes the default weight by subratcing them from everybody else
Also moves the min/max values towards 0 if they are all above or below it
**************************************************************************/
void RankBoostModel::remove_default_weights()
{
int i;
for (i=0; i<num_real_features; i++)
{
if (this->real_default_weights[i] != 0)
{
int j;
for (j=0 ;j<real_weights[i].size(); j++)
real_weights[i][j] -= real_default_weights[i];
real_default_weights[i]=0;
}
}
}
/*************************************************************************
**************************************************************************/
double RankBoostModel::calc_training_rank_error(const RankBoostDataset& training_ds) const
{
const int num_samples = training_ds.get_num_samples();
const vector<RankBoostSample>& samples = training_ds.get_samples();
vector<weight_t> rank_scores;
rank_scores.resize(num_samples);
int i;
for (i=0; i<num_samples; i++)
rank_scores[i]=calc_rank_score(samples[i]);
const vector<SamplePairWeight>& phi_support = training_ds.get_phi_support();
double train_error=0;
for (i=0; i<phi_support.size(); i++)
{
if (rank_scores[phi_support[i].idx1]>=rank_scores[phi_support[i].idx2])
train_error+=phi_support[i].weight;
}
return train_error/training_ds.get_total_phi_weight();
}
struct rank_score_pair {
rank_score_pair() : rank(999), score(NEG_INF) {};
rank_score_pair(int _r, float _s) : rank(_r), score(_s) {};
bool operator< (const rank_score_pair& other) const
{
return (score>other.score);
}
int rank;
float score;
};
/***************************************************************************
if tag3 != -1, it is used to filter out samples that don't have the same tag value.
Calculates the weighted ranking error for a given dataset.
Also calculates the error in peak ranking for all peptides:
Avg predicted rank for strongest peak and sd of prediction for strongest peaks,
Avg predicted rank fro 2nd strongest peak...
****************************************************************************/
double RankBoostModel::calc_prediction_error(const RankBoostDataset& rank_ds,
vector<peak_rank_stat>& peak_stats,
int tag3_filter_val,
int *ptr_num_groups_tested) const
{
const int num_samples = rank_ds.get_num_samples();
const vector<RankBoostSample>& samples = rank_ds.get_samples();
const int num_groups = rank_ds.get_num_groups();
vector<weight_t> rank_scores;
rank_scores.resize(num_samples,NEG_INF);
if (ptr_num_groups_tested)
{
vector<bool> group_inds;
group_inds.resize(num_groups,false);
int i;
for (i=0; i<num_samples; i++)
if (tag3_filter_val<0 || samples[i].tag3 == tag3_filter_val)
{
rank_scores[i]=calc_rank_score(samples[i]);
group_inds[samples[i].group_idx]=true;
}
*ptr_num_groups_tested=0;
for (i=0; i<group_inds.size(); i++)
if (group_inds[i])
(*ptr_num_groups_tested)++;
}
else
{
int i;
for (i=0; i<num_samples; i++)
if (tag3_filter_val<0 || samples[i].tag3 == tag3_filter_val)
rank_scores[i]=calc_rank_score(samples[i]);
}
int i;
const vector<SamplePairWeight>& phi_support = rank_ds.get_phi_support();
double error=0;
double phi_weight=0;
for (i=0; i<phi_support.size(); i++)
{
const int idx1 = phi_support[i].idx1;
const int idx2 = phi_support[i].idx2;
if (rank_scores[idx1]<=NEG_INF || rank_scores[idx2]<=NEG_INF)
continue;
if (rank_scores[idx1]>=rank_scores[idx2])
error+=phi_support[i].weight;
phi_weight += phi_support[i].weight;
}
double weighted_error = error/phi_weight;
// cout << ">>>" << tag3_filter_val << "\t" << error << "\t" << phi_weight << endl;
vector< vector<rank_score_pair> > peptide_peak_ranks;
peptide_peak_ranks.clear();
peptide_peak_ranks.resize(num_groups);
vector< vector<int> > peak_predictions; // org rank / predcited rank
peak_predictions.resize(100); // upto 100 strong peaks...
for (i=0; i<100; i++)
peak_predictions[i].resize(100,0);
for (i=0; i<num_samples; i++)
{
if (rank_scores[i]<=NEG_INF)
continue;
const RankBoostSample& sam = samples[i];
peptide_peak_ranks[sam.group_idx].push_back(rank_score_pair(sam.rank_in_group,rank_scores[i]));
}
for (i=0; i<peptide_peak_ranks.size(); i++)
{
if (peptide_peak_ranks[i].size() == 0)
continue;
sort(peptide_peak_ranks[i].begin(),peptide_peak_ranks[i].end());
int j;
for (j=0; j<peptide_peak_ranks[i].size(); j++)
{
const int org_rank = peptide_peak_ranks[i][j].rank;
const int predicted_rank = (j > 99 ? 99 : j);
if (org_rank<100)
peak_predictions[org_rank][predicted_rank]++;
}
}
peak_stats.clear();
for (i=0; i<100; i++)
{
vector<int> vals,counts;
int peak_count=0;
int j;
for (j=0; j<peak_predictions[i].size(); j++)
peak_count += peak_predictions[i][j];
if (peak_count < 5)
break;
double mean=0,sd=-1;
for (j=0; j<peak_predictions[i].size(); j++)
{
if (peak_predictions[i][j]>0)
{
vals.push_back(j);
counts.push_back(peak_predictions[i][j]);
}
}
calc_mean_sd_from_counts(vals,counts,&mean,&sd);
peak_rank_stat prs;
prs.true_rank=i;
prs.avg_precited_rank = mean;
prs.sd_predicted_rank = sd;
prs.precent_predicted_correctly = peak_predictions[i][i]/(float)peak_count;
peak_stats.push_back(prs);
}
return weighted_error;
}
/*************************************************************************
**************************************************************************/
weight_t RankBoostModel::calc_rank_score(const RankBoostSample& sample) const
{
const vector<int>& binary_idxs = sample.binary_non_zero_idxs;
weight_t score=0;
if (! ind_was_initialized)
{
cout << "Error: RankBoostModel not initialized!" << endl;
exit(1);
}
int i;
for (i=0; i<binary_idxs.size(); i++)
score += binary_weights[binary_idxs[i]];
// first add all no vote scores
score += total_default_weight;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -