📄 me_reg.h
字号:
#ifndef __ME_REG_H__
#define __ME_REG_H__
#include "includes.h"
// Scalar type used for every feature value.
typedef float value_t;

// Pairs a feature index with the value of that feature.
// Instances are ordered by feature index only; the value is ignored when comparing.
struct fval {
    // Default entry: index -1 and a zero value.
    fval()
        : f_idx(-1), val(0)
    {
    }

    // Entry for feature `_f_idx` holding `_val`.
    fval(int _f_idx, value_t _val)
        : f_idx(_f_idx), val(_val)
    {
    }

    // Strict ordering on the feature index.
    bool operator< (const fval& other) const
    {
        return other.f_idx > f_idx;
    }

    int     f_idx;  // index of the feature
    value_t val;    // value of the feature
};
// A single sample for a two-class logistic regression.
// Only the first class carries feature values; the other class is always 0.
struct ME_Regression_Sample {
    // Starts with label and prop at -1, prop_score at NEG_INF and a weight of 1.
    ME_Regression_Sample()
        : label(-1), prop(-1), prop_score(NEG_INF), weight(1)
    {
    }

    void print(const char **feature_names=NULL) const;

    void remove_feature(int f_idx);

    // Returns the value stored for feature `f_idx`.
    // Scans f_vals front to back and returns the first entry whose index matches;
    // yields 0 when the sample holds no entry for that feature.
    value_t get_feature_value(int f_idx) const
    {
        for (vector<fval>::const_iterator it = f_vals.begin(); it != f_vals.end(); ++it)
        {
            if (it->f_idx == f_idx)
                return it->val;
        }
        return 0;
    }

    int    label;        // the true class of the sample
    int    prop;         // optional additional property that is not used in the model
    double prop_score;   // optional additional property
    double weight;       // the weight of this sample

    vector<fval> f_vals; // holds feature values
};
// A weighted collection of ME_Regression_Sample objects together with
// bookkeeping counters (sample/feature/class counts and weights).
class ME_Regression_DataSet {
public:
    // Creates an empty two-class dataset.
    // Initializers are listed in member declaration order, which is the order
    // in which C++ actually runs them. max_label starts at num_classes - 1 so
    // that the invariant documented on the members (num_classes == max_label + 1)
    // holds from construction instead of max_label being left indeterminate.
    ME_Regression_DataSet()
        : max_label(1), num_classes(2), num_samples(0), num_features(0), total_weight(0)
    {
    }

    // Creates an empty two-class dataset and pre-allocates room for
    // exp_samples samples. A non-positive exp_samples reserves nothing
    // (a negative int would otherwise convert to a huge unsigned size and
    // make reserve() throw std::length_error).
    ME_Regression_DataSet(int exp_samples)
        : max_label(1), num_classes(2), num_samples(0), num_features(0), total_weight(0)
    {
        if (exp_samples > 0)
            samples.reserve(exp_samples);
    }

    void clear(int num_classes =2);

    void add_sample(const ME_Regression_Sample& sam);

    void add_samples(const vector<ME_Regression_Sample>& samples);

    void randomly_remove_samples_with_activated_feature(int label, int feature_idx, float prob_remove = 0.9);

    void tally_samples(bool print = false);

    // checks that features are within bounds and values are ok (not nan)
    int check_samples(bool remove_bad);

    void print() const;

    void print_summary() const;

    // prints info on features (num non zero and p~(f) )
    void print_feature_summary(ostream& os = cout, const char ** feature_names = NULL) const;

    // Splits the dataset into two new mutually exclusive sets. The caller must
    // supply a vector with the indices of the test set; all the other samples
    // are sent to the training set.
    void split_set(ME_Regression_DataSet& training,ME_Regression_DataSet& test,
                   vector<int> test_idxs) const;

    // outputs the dataset; if file is null the output goes to the screen
    void write_data_to_file(const char *file=NULL) const;

    // calibrates class weights so class 0 has the desired weight
    void calibrate_class_weights(double class_0_weight);

    // calculates for each feature the proportion of samples (weight)
    // for which the feature has a non-zero value
    void calc_feature_non_zero_weights(vector< vector<double> >& ratios,
                                       vector< vector<double> >& avg_nz) const;

    // gives various statistics about a certain feature's values
    void report_feature_statistics(int f_idx, char *name = NULL) const;

    // calibrates the weight of all the samples so the new weights add
    // up to *total_weight*
    void rescale_dataset_weights(double total_weight);

    // extracts all the samples of the given class and puts them in a new
    // dataset
    void extract_class_samples(int label, ME_Regression_DataSet& extract) const;

    // extracts samples that have a non-zero value for the given feature
    void extract_samples_with_activated_feature(int feature_idx,ME_Regression_DataSet& extract) const;

    // adds the samples from the other dataset, and adjusts weights
    void add_other_dataset_samples(const ME_Regression_DataSet& other);

    // changes the weights of the data points according to the weights
    // given in ratios (must have k ratios for values of prop that
    // are assumed to be 0,1,..,k-1)
    void rescale_dataset_according_to_prop(const vector<double>& ratios);

    // returns all samples in the dataset that have a desired label
    void get_samples_with_label(int label, vector<int>& idxs) const;

    // removes features that have low occurrence counts (to avoid over-fitting
    // and model stability issues)
    void purge_low_count_features(int min_num_occurrences);

    // returns the relative weight in the class of a certain feature
    double get_relative_weight_of_feature(int label, int feature_idx) const;

    // sets the weights of samples in the class in such a way that the relative weight of samples
    // with non-zero values for the given feature is given in the relative_weight
    void scale_samples_to_feature_relative_weight(int label, int feature_idx, double relative_weight);

    void serial_scale(const vector<int>& feature_idxs);

    int max_label;      // maximum index of a class label, labels should be 0,..,k-1
    int num_classes;    // number of classes k in the data = max_label+1
    int num_samples;
    int num_features;
    double total_weight;

    vector<double> class_weights;
    vector<ME_Regression_Sample> samples;
};
class ME_Regression_Model {
public:
ME_Regression_Model() : num_features(-1), num_classes(0), t_iterations(0),
has_weights(false) { f_weights.clear(); }
// trains the model's weights using IIS
// either specify the number of iterations, or the minimal improvement in log likelihood
// (min ratio for improvement/log_likelihood)
void train_IIS2(const ME_Regression_DataSet& ds,
int max_iterations,
double min_log_like_improve,
vector<int>& feature_idxs);
// trains model using CG - Logistic Regression
// returns false if there was no convergence (numerical stability issues...)
bool train_cg(const ME_Regression_DataSet& ds, int max_interations=100,
double epsilon=1E-4, int reset_rounds =0 );
// takes all the samples in the dataset with the right label and calculates the probs
// sorts them. Let x be the probability at the desired percentile, and t be the
// target probability. The function retutns y, s.t. x^y=t
float calc_log_scaling_constant(int label,
const ME_Regression_DataSet& ds,
float target_prob) const;
void report_exp_sums(const ME_Regression_Sample& sam) const;
double p_y_given_x(int label,const ME_Regression_Sample& sam) const
{
int i;
double e,sum_exp=0;
for (i=0; i<sam.f_vals.size(); i++)
{
const int& f_idx = sam.f_vals[i].f_idx;
sum_exp+=f_weights[f_idx]*sam.f_vals[i].val;
}
if (sum_exp<-20)
sum_exp=-20;
if (sum_exp>20)
sum_exp=20;
e=exp(sum_exp);
if (label == 0)
return (e/(1.0 + e));
return (1.0 / (1.0 + e));
}
float get_sum_exp(const ME_Regression_Sample& sam) const
{
int i;
float sum_exp=0;
for (i=0; i<sam.f_vals.size(); i++)
{
const int& f_idx = sam.f_vals[i].f_idx;
sum_exp+=f_weights[f_idx]*sam.f_vals[i].val;
}
return sum_exp;
}
// sets the weights to the model returns a constant probability p for the class 0 samples
void set_weigts_for_const_prob(float p);
void print_ds_probs(const ME_Regression_DataSet& ds) const;
void print_ds_histogram(const ME_Regression_DataSet& ds) const;
double log_likelihood(const ME_Regression_DataSet& ds) const;
void print_weights_c();
void read_weights_line(char *buff);
void write_regression_model(ostream& os = cout) const;
void read_regression_model(istream& is);
int get_num_features() const { return num_features; }
void set_num_features(int n) { num_features = n; f_weights.resize(n); };
int get_num_classes() const { return num_classes; }
double get_weight(int w) const { return f_weights[w]; }
void set_num_classes(int nc) { num_classes=nc; }
bool get_has_weights() const { return has_weights; }
private:
// LOGISTIC REGRESSION FUNCTIONS
/************************************************************************
// calculates p(y|x) for all x,y
// first index in p is class index 0..k-1
// second index in p is sample number 0..n-1
*************************************************************************/
void calc_p_for_all_samples(const ME_Regression_DataSet& ds, vector<double>& p) const;
/*********************************************************************
Calculates the vector for the first derivatives with the current lambdas
This function assumes that the values of p and fp are already computed.
**********************************************************************/
void calc_neg_first_derivative(const ME_Regression_DataSet& ds, const vector<double>& p,
vector<double>& vals) const;
/******************************************************************
Calculates the matrix for the Hesian of the current lambdas.
This function assumes that the values of p and fp are already computed.
*******************************************************************/
void calc_Hesian(const ME_Regression_DataSet& ds, const vector<double>& p,
vector< vector<double> >& hes) const;
/******************************************************************
Calculates the matrix for the Hesian of the current lambdas.
This function assumes that the values of p and fp are already computed.
*******************************************************************/
void calc_Hesian2(const ME_Regression_DataSet& ds, const vector<double>& p,
vector< vector<double> >& hes) const;
int t_iterations; // number of interations in training cycle
int num_features;
int num_classes;
vector<double> f_weights; // the weights lambda_i of the features
bool has_weights;
};
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -