📄 me_regression_dataset.cpp
字号:
}
// Returns the fraction of the total weight of class `label` that belongs to samples
// whose value for feature `feature_idx` is non-zero.
// If the class carries no weight (no sample has that label, or their weights sum to
// zero) the function returns 0 instead of evaluating 0/0, so callers never receive NaN.
double ME_Regression_DataSet::get_relative_weight_of_feature(int label, int feature_idx) const
{
    double class_weight=0;   // summed weight of all samples with the requested label
    double feature_weight=0; // part of class_weight whose samples have a non-zero feature value

    size_t i;
    for (i=0; i<samples.size(); i++)
    {
        if (samples[i].label != label)
            continue;

        class_weight += samples[i].weight;
        if (samples[i].get_feature_value(feature_idx) != 0)
            feature_weight += samples[i].weight;
    }

    // nothing was accumulated for this class: the ratio is undefined, report 0
    if (class_weight == 0)
        return 0;

    return (feature_weight/class_weight);
}
// sets the weights of samples in the class in such a way that the relative weight of samples
// with non-zero values for the given feature is given in the relative_weight
void ME_Regression_DataSet::scale_samples_to_feature_relative_weight(int label,
int feature_idx, double relative_weight)
{
double class_weight=0;
double feature_weight=0;
vector<bool> sams_ind;
sams_ind.resize(samples.size(),false);
int i;
for (i=0; i<samples.size(); i++)
{
if (samples[i].label != label)
continue;
class_weight += samples[i].weight;
if (samples[i].get_feature_value(feature_idx) != 0)
{
feature_weight+=samples[i].weight;
sams_ind[i]=true;
}
}
double org_weight=feature_weight/class_weight;
if (org_weight<=0)
return;
double mult_feature = relative_weight/org_weight;
if (mult_feature<0.2)
mult_feature=0.2;
if (mult_feature>5)
mult_feature=5;
relative_weight = mult_feature*org_weight;
double mult_others = (1.0-relative_weight)/(1.0-org_weight);
for (i=0; i<samples.size(); i++)
samples[i].weight *= (sams_ind[i] ? mult_feature : mult_others);
tally_samples();
}
/**************************************************************************
Goes over the given features one after the other. For each feature the
relative weight it has in class 0 is measured, and the samples of class 1
are then rescaled towards that relative weight.
***************************************************************************/
void ME_Regression_DataSet::serial_scale(const vector<int>& feature_idxs)
{
    int pos=0;
    while (pos<feature_idxs.size())
    {
        const int feature = feature_idxs[pos];

        // class 0 supplies the target ratio, class 1 is the one being rescaled
        const double class0_ratio = get_relative_weight_of_feature(0,feature);
        scale_samples_to_feature_relative_weight(1,feature,class0_ratio);

        pos++;
    }
}
void ME_Regression_DataSet::report_feature_statistics(int f_idx, char *name) const
{
vector<double> vals0,vals1;
double avg_nz0=0, avg_nz1=0;
double wnz0=0, wnz1=0, wz0=0, wz1=0;
int i;
for (i=0; i<samples.size(); i++)
{
double val = 0;
int j;
for (j=0; j<samples[i].f_vals.size(); j++)
{
if (samples[i].f_vals[j].f_idx == f_idx)
{
val = samples[i].f_vals[j].val;
break;
}
}
double weight = samples[i].weight;
int label = samples[i].label;
if (val != 0)
{
if (label == 0)
{
wnz0+= weight;
avg_nz0 += weight * val;
vals0.push_back(val);
}
else
{
wnz1+= weight;
avg_nz1 += weight * val;
vals1.push_back(val);
}
}
else
{
if (label == 0)
{
wz0+=weight;
}
else
wz1+=weight;
}
}
if (avg_nz0 != 0)
avg_nz0/=wnz0;
if (avg_nz1 != 0)
avg_nz1/=wnz1;
printf("Statistics for feature %d ",f_idx);
if (name)
printf(" %s",name);
printf("\n");
printf("Class 0:\n");
printf("weight samples with non-zero vals: %.3f (%.2f) samples with zero val: %.3f (%.2f)\n",
wnz0,wnz0/(wnz0+wz0),wz0,wz0/(wnz0+wz0));
printf("Avg weighted: %g non-weighted vals:\n",avg_nz0);
sort(vals0.begin(),vals0.end());
// prints avgs of tenths of the values
int ts=vals0.size()/10;
int p=0;
for (i=0; i<9; i++)
{
int next=p+ts;
int j;
double av=0;
for (j=p; j<next; j++)
av+=vals0[j];
printf("%.4f ",av/ts);
p+=ts;
}
double av=0;
for (i=p; i<vals0.size(); i++)
av+=vals0[i];
printf("%.4f\n",av/(vals0.size()-p));
printf("Class 1:\n");
printf("weight samples with non-zero vals: %.3f (%.2f) samples with zero val: %.3f (%.2f)\n",
wnz1,wnz1/(wnz1+wz1),wz1,wz1/(wnz1+wz1));
printf("Avg weighted: %g non-weighted vals:\n",avg_nz1);
sort(vals1.begin(),vals1.end());
// prints avgs of tenths of the values
ts=vals1.size()/10;
p=0;
for (i=0; i<9; i++)
{
int next=p+ts;
int j;
double av=0;
for (j=p; j<next; j++)
av+=vals1[j];
printf("%.4f ",av/ts);
p+=ts;
}
av=0;
for (i=p; i<vals1.size(); i++)
av+=vals1[i];
printf("%.4f\n\n\n",av/(vals1.size()-p));
}
// Fills `extract` with the samples of this dataset whose label equals `label`.
// Whatever `extract` held before is discarded; it receives this dataset's number of
// classes and is tallied once all matching samples were added.
void ME_Regression_DataSet::extract_class_samples(int label, ME_Regression_DataSet& extract) const
{
    // start from an empty dataset
    extract.samples.clear();
    extract.num_samples = 0;
    extract.num_classes = num_classes;

    for (int s=0; s<num_samples; ++s)
    {
        if (samples[s].label != label)
            continue;
        extract.add_sample(samples[s]);
    }

    extract.tally_samples();
}
// Extracts the samples that have a non-zero value for the given feature.
// `extract` is emptied first, receives this dataset's number of classes, and is
// tallied after the matching samples were added. Each matching sample is added
// exactly once, even if its feature list mentions feature_idx more than once.
void ME_Regression_DataSet::extract_samples_with_activated_feature(int feature_idx,
                                    ME_Regression_DataSet& extract) const
{
    extract.samples.clear();
    extract.num_samples = 0;
    extract.num_classes = num_classes;

    int i;
    for (i=0; i<num_samples; i++)
    {
        size_t j;
        for (j=0; j<samples[i].f_vals.size(); j++)
            if (samples[i].f_vals[j].f_idx == feature_idx && samples[i].f_vals[j].val != 0)
            {
                extract.add_sample(samples[i]);
                break; // the sample qualifies; do not scan on and risk adding it again
            }
    }

    extract.tally_samples();
}
// Appends the first other.num_samples samples of `other` to this dataset (through
// add_sample) and then calls tally_samples() on this dataset.
void ME_Regression_DataSet::add_other_dataset_samples(const ME_Regression_DataSet& other)
{
    int s=0;
    while (s<other.num_samples)
    {
        add_sample(other.samples[s]);
        ++s;
    }

    tally_samples();
}
// Collects, in increasing order, the indices of all samples whose label equals
// `label`. Previous contents of `idxs` are discarded.
void ME_Regression_DataSet::get_samples_with_label(int label, vector<int>& idxs) const
{
    idxs.clear();

    for (int s=0; s<samples.size(); ++s)
    {
        if (samples[s].label != label)
            continue;
        idxs.push_back(s);
    }
}
// Prints info on features (num non zero and p~(f) ), one line per feature, to `os`.
// Each line holds the feature index, the two values ratios[i][0] and ratios[i][1],
// the pair avg_nz[i][0] , avg_nz[i][1] in parentheses (all as filled in by
// calc_feature_non_zero_weights), and, when feature_names is not null, the name
// feature_names[i].
// All output, including the name and the end of line, goes to `os`.
void ME_Regression_DataSet::print_feature_summary(ostream& os, const char **feature_names) const
{
    vector< vector<double> > ratios, avg_nz;
    calc_feature_non_zero_weights(ratios,avg_nz);

    int i;
    for (i=0; i<num_features; i++)
    {
        os << setw(4) << left << i << " ";
        os << setw(10) << left << setprecision(3) << ratios[i][0] << " " << setw(10) << setprecision(3) << left << ratios[i][1] << " ";
        os << " ( " << setw(6) << setprecision(3) << left << avg_nz[i][0] << " , " << setw(6) << setprecision(3) << left << avg_nz[i][1] << ") ";
        os << " ";

        // the field width is set only when a name is written, so no pending width
        // is left behind on the caller's stream
        if (feature_names)
            os << setw(6) << feature_names[i];

        os << endl;
    }
}
void ME_Regression_DataSet::clear(int num_classes)
{
num_classes=num_classes; // number of classes k in the data = max_label+1
num_samples=0;
num_features=0;
class_weights.clear();
if (num_classes>0)
class_weights.resize(num_classes,0);
samples.clear();
}
// Writes the sample to cout.
// With feature names: a "LABEL ..., weight ..." header, then one line per feature
// value holding its index, its value (fixed notation, precision 3) and its name.
// Without feature names: a compact form - a "> label weight" line followed by all
// "index value" pairs on a single line.
void ME_Regression_Sample::print(const char **feature_names) const
{
    int k;

    if (feature_names)
    {
        cout << "LABEL " << label << ", weight " << weight << endl;
        for (k=0; k<f_vals.size(); k++)
        {
            cout << f_vals[k].f_idx << "\t" << setprecision(3) << fixed << f_vals[k].val << "\t" <<
                feature_names[f_vals[k].f_idx] << endl;
        }
        cout << endl;
    }
    else
    {
        cout << "> " << label << " " <<weight << endl;
        for (k=0; k<f_vals.size(); k++)
            cout << f_vals[k].f_idx << " " << f_vals[k].val << " ";
        cout << endl;
    }
}
// Removes the first entry of f_vals whose feature index equals f_idx, keeping the
// order of the remaining entries. Does nothing if there is no such entry.
void ME_Regression_Sample::remove_feature(int f_idx)
{
    // find the position of the first entry for this feature
    int pos=0;
    while (pos<f_vals.size() && f_vals[pos].f_idx != f_idx)
        pos++;

    // erase shifts the following entries one place down and drops the last slot
    if (pos<f_vals.size())
        f_vals.erase(f_vals.begin()+pos);
}
// Prints the first num_samples samples, one after the other, using each sample's
// own print() with its default arguments.
void ME_Regression_DataSet::print() const
{
    int s=0;
    while (s<num_samples)
    {
        samples[s].print();
        ++s;
    }
}
// Prints to stdout the number of classes, the number of samples, the total weight,
// and for every class its weight divided by the total weight.
void ME_Regression_DataSet::print_summary() const
{
    printf("Classes %d\n",num_classes);
    printf("Samples %d\n",num_samples);
    printf("Total weight %.3f\n",total_weight);

    printf("Relative class weights:\n");
    int c=0;
    while (c<num_classes)
    {
        printf("%d - %.4f\n",c,class_weights[c]/total_weight);
        ++c;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -