📄 preprocessors.cpp
字号:
makerind.randomGenerator = randomGenerator ? randomGenerator : mlnew TRandomGenerator;;
// this will not assign the defaultProportion to the class
vector<float> props(domain.attributes->size(), defaultProportion > 0.0 ? defaultProportion : 0.0);
getProportions(proportions, domain, props);
int idx = 0;
vector<float>::const_iterator pi(props.begin()), pe(props.end());
for(; pi != pe; idx++, pi++)
if (*pi > 0.0) {
PLongList rind = makerind(n, 1 - *pi);
const unsigned char &varType = domain.variables->at(idx)->varType;
int eind = 0;
PITERATE(TLongList, ri, rind) {
if (*ri)
(*table)[eind][idx] = TValue(varType, specialType);
eind++;
}
}
return wtable;
}
// Constructs the preprocessor, storing `dev` in the `deviation` member
// (the deviation later handed to the Gaussian noise generator).
TPreprocessor_addGaussianClassNoise::TPreprocessor_addGaussianClassNoise(const float &dev)
  : deviation(dev)
{
}
// Returns a new example table copied from `gen`; when `deviation` is positive
// the class value is perturbed by a TGaussianNoiseGenerator.
//
//   gen       - source examples; the domain must have a continuous class
//   weightID  - id of the weight meta attribute; passed through unchanged
//   newWeight - (out) set to weightID
//
// Raises an error (via raiseError) if the domain has no class or the class
// is not of type TValue::FLOATVAR.
PExampleGenerator TPreprocessor_addGaussianClassNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PVariable classVar = gen->domain->classVar;

  if (!classVar)
    raiseError("Class-less domain");

  if (classVar->varType != TValue::FLOATVAR)
    raiseError("Class '%s' is not continuous", gen->domain->classVar->name.c_str());

  newWeight = weightID;

  // Nothing to perturb: just materialize the generator into a table.
  if (!(deviation > 0.0))
    return mlnew TExampleTable(gen);

  // The noise generator is told to touch a single position: the one right
  // after the last attribute, paired with the requested deviation.
  const int classPosition = gen->domain->attributes->size();
  vector<pair<int, float> > noiseSpec;
  noiseSpec.push_back(pair<int, float>(classPosition, deviation));

  TGaussianNoiseGenerator noisyGen(noiseSpec, gen, randomGenerator);
  return PExampleGenerator(mlnew TExampleTable(PExampleGenerator(noisyGen)));
}
// Constructs the preprocessor.
//   cm - stored in `proportion`
//   st - stored in `specialType` (the special-value type used for the class
//        values that get replaced)
TPreprocessor_addMissingClasses::TPreprocessor_addMissingClasses(const float &cm, const int &st)
  : proportion(cm),
    specialType(st)
{
}
// Copies `gen` into a new table and, when `proportion` is positive, replaces
// the class of randomly selected examples with a special value of type
// `specialType`.
//
//   gen       - source examples; the domain must have a class variable
//   weightID  - id of the weight meta attribute; passed through unchanged
//   newWeight - (out) set to weightID
//
// Raises an error (via raiseError) for class-less domains.
PExampleGenerator TPreprocessor_addMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar)
    raiseError("Class-less domain");

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  // No (or a non-positive) proportion requested: return the plain copy.
  if (!(proportion > 0.0)) {
    newWeight = weightID;
    return wtable;
  }

  TMakeRandomIndices2 indexMaker;
  indexMaker.randomGenerator = randomGenerator;
  PLongList selection(indexMaker(table->size(), 1-proportion));

  const TVariable &classVar = table->domain->classVar.getReference();
  const int &classType = classVar.varType;

  // Walk the index list in step with the examples; a non-zero index marks
  // an example whose class is replaced by the special value.
  int exampleNo = 0;
  PITERATE(TLongList, si, selection) {
    if (*si)
      (*table)[exampleNo].setClass(TValue(classType, specialType));
    exampleNo++;
  }

  newWeight = weightID;
  return wtable;
}
// Default constructor: starts with a freshly allocated, empty list of class
// weights and with equalization switched off.
TPreprocessor_addClassWeight::TPreprocessor_addClassWeight()
  : classWeights(mlnew TFloatList),
    equalize(false)
{
}
// Constructs the preprocessor from explicit settings.
//   cw - stored in `classWeights`
//   eq - stored in `equalize`
// The initializers are written in the same order as in the default
// constructor; the actual initialization order is determined by the member
// declaration order, not by the order written here.
TPreprocessor_addClassWeight::TPreprocessor_addClassWeight(PFloatList cw, const bool &eq)
  : classWeights(cw),
    equalize(eq)
{
}
// Copies `gen` into a new table and attaches a new weight meta attribute
// whose value is the example's original weight multiplied by a per-class
// factor.
//
// The per-class factors are chosen as follows:
//   - equalize == false : the factors are the entries of `classWeights`;
//   - equalize == true, no class weights : N / (number of non-empty classes)
//     / (class frequency), where N is the total of the class distribution;
//     classes with zero frequency get factor 1;
//   - equalize == true with class weights : classWeight / (class frequency)
//     * (sum of weights of non-empty classes) * N.
//
//   gen       - source examples; the domain must have a discrete class
//   weightID  - id of the existing weight meta attribute
//   newWeight - (out) id of the newly created weight meta attribute, or 0
//               when there is nothing to do (no weights and no equalization,
//               a class with no values, or all applicable weights are zero)
//
// Raises an error (via raiseError) if the domain is class-less or the class
// is not discrete, if `classWeights` is non-empty but its size differs from
// the number of class values, or if the class distribution has more entries
// than the class has values.
PExampleGenerator TPreprocessor_addClassWeight::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar || (gen->domain->classVar->varType != TValue::INTVAR))
    raiseError("Class-less domain or non-discrete class");

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  const int nocl = gen->domain->classVar->noOfValues();

  // `classWeights` may be unset; an unset list is treated like an empty one.
  // (Previously the first test below dereferenced it unconditionally.)
  const bool hasClassWeights = classWeights && classWeights->size();

  if ((!equalize && !hasClassWeights) || !nocl) {
    newWeight = 0;
    return wtable;
  }

  if (hasClassWeights && (classWeights->size() != nocl))
    raiseError("size of classWeights should equal the number of classes");

  vector<float> weights;

  if (equalize) {
    PDistribution dist(getClassDistribution(gen, weightID));
    const TDiscDistribution &ddist = CAST_TO_DISCDISTRIBUTION(dist);
    if (ddist.size() > nocl)
      raiseError("there are out-of-range classes in the data (attribute descriptor has too few values)");

    if (hasClassWeights) {
      // Sum the weights of the classes that actually occur in the data.
      // ddist is at most as long as classWeights (checked above), so cwi
      // cannot run past the end here.
      float tot_w = 0.0;
      TFloatList::const_iterator cwi(classWeights->begin());
      TDiscDistribution::const_iterator di(ddist.begin()), de(ddist.end());
      for(; di!=de; di++, cwi++)
        if (*di > 0.0)
          tot_w += *cwi;

      if (tot_w == 0.0) {
        newWeight = 0;
        return wtable;
      }

      // NOTE(review): this multiplies by tot_w; if the intent is to keep the
      // total weight equal to N, a division by tot_w would be expected.
      // Left unchanged -- confirm against the intended normalization.
      float fact = tot_w * ddist.abs;

      // The distribution may be shorter than classWeights (trailing classes
      // that never occur), and may contain zero counts. Such classes get a
      // neutral factor of 1, as in the equalize-only branch, instead of
      // reading past the end of ddist or dividing by zero.
      di = ddist.begin();
      PITERATE(TFloatList, wi, classWeights) {
        if ((di != de) && (*di > 0.0))
          weights.push_back(*wi / *di * fact);
        else
          weights.push_back(1.0);
        if (di != de)
          di++;
      }
    }

    else { // no class weights, only equalization
      int noNullClasses = 0;
      { const_ITERATE(TDiscDistribution, di, ddist)
          if (*di>0.0)
            noNullClasses++;
      }

      const float N = ddist.abs;
      const_ITERATE(TDiscDistribution, di, ddist)
        if (*di>0.0)
          weights.push_back(N / noNullClasses / *di);
        else
          weights.push_back(1.0);
    }
  }

  else // no equalization, only weights
    weights = classWeights.getReference();

  newWeight = getMetaID();
  PEITERATE(ei, table) {
    // Examples whose class is unknown (or otherwise outside the computed
    // factors) keep their original weight rather than indexing `weights`
    // with a meaningless value.
    const TValue &cls = (*ei).getClass();
    float classFactor = 1.0;
    if (!cls.isSpecial() && (cls.intV >= 0) && (cls.intV < int(weights.size())))
      classFactor = weights[cls.intV];
    (*ei).setMeta(newWeight, TValue(WEIGHT(*ei) * classFactor));
  }

  return wtable;
}
// Forward declarations of the survival estimators used by
// TPreprocessor_addCensorWeight::operator() below; they are defined elsewhere.
// Both take the example generator, the index of the outcome variable, the
// value denoting failure, the index of the time variable and the weight id;
// bayesSurvival additionally takes the maximal time.
PDistribution kaplanMeier(PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID);
PDistribution bayesSurvival(PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID, const float &maxTime);
// Default constructor: outcome variable, time variable and event value are
// left default-constructed, the weighting method is `km`, no maximal time is
// set (0.0) and complementary examples are not added.
TPreprocessor_addCensorWeight::TPreprocessor_addCensorWeight()
  : outcomeVar(),
    timeVar(),
    eventValue(),
    method(km),
    maxTime(0.0),
    addComplementary(false)
{
}
// Constructs the preprocessor from explicit settings.
//   ov - outcome variable (stored in `outcomeVar`)
//   tv - time variable (stored in `timeVar`)
//   ev - value of the outcome variable that denotes the event (`eventValue`)
//   me - weighting method (`method`)
//   mt - maximal time (`maxTime`)
// Complementary examples are not added by default.
TPreprocessor_addCensorWeight::TPreprocessor_addCensorWeight(PVariable ov, PVariable tv, const TValue &ev, const int &me, const float &mt)
  : outcomeVar(ov),
    timeVar(tv),
    eventValue(ev),
    method(me),
    maxTime(mt),   // was hard-coded to 0.0, silently discarding the `mt` argument
    addComplementary(false)
{
}
// Appends a copy of `example` to `table` with meta attribute `weightID` set
// to `weight`. If `complementary` is non-negative and `compWeight` is
// positive, a second copy is appended whose class is set to `complementary`
// and whose weight is `compWeight`.
void TPreprocessor_addCensorWeight::addExample(TExampleTable *table, const int &weightID, const TExample &example, const float &weight, const int &complementary, const float &compWeight)
{
  TExample weighted = example;
  weighted.setMeta(weightID, TValue(weight));
  table->addExample(weighted);

  // No complementary example requested, or it would carry no weight.
  if ((complementary < 0) || !(compWeight > 0.0))
    return;

  weighted.setClass(TValue(complementary));
  weighted.setMeta(weightID, TValue(compWeight));
  table->addExample(weighted);
}
// Builds a new table (same domain as `gen`) in which every example carries a
// new weight meta attribute that accounts for censoring.
//
// Examples whose outcome equals `eventValue` are copied with their original
// weight. For the remaining (censored) examples with a known time:
//   - method == linear : weight is scaled by time / maxTime (factor 1 when
//     the time exceeds the maximum); if `maxTime` is not set, the largest
//     time found in `gen` is used;
//   - method == km     : weight is scaled by KM(maxTime) / KM(time);
//     examples for which KM(time) is not positive are dropped;
//   - method == bayes  : weight is scaled by the value of the survival
//     distribution at the example's time.
// When `addComplementary` is set, censored examples handled by km/bayes are
// also added a second time with the class set to the event value and the
// remaining weight. Censored examples with unknown time are not copied.
//
//   gen       - source examples
//   weightID  - id of the existing weight meta attribute
//   newWeight - (out) id of the newly created weight meta attribute
//
// Raises an error (via raiseError) when `eventValue` is unset or not
// discrete, when the outcome or time variable cannot be found, when a time
// value is not continuous, when no positive maximal time can be determined
// for the linear method, when `maxTime` is missing for the Bayes method, or
// when `method` is unknown.
PExampleGenerator TPreprocessor_addCensorWeight::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (eventValue.isSpecial())
    raiseError("'eventValue' not set");

  if (eventValue.varType != TValue::INTVAR)
    raiseError("'eventValue' invalid (discrete value expected)");

  const int failIndex = eventValue.intV;

  // The outcome is either the explicitly given variable or the class, which
  // is addressed by the index following the last attribute.
  int outcomeIndex;
  if (outcomeVar) {
    outcomeIndex = gen->domain->getVarNum(outcomeVar, false);
    if (outcomeIndex==ILLEGAL_INT)
      raiseError("outcomeVar not found in domain");
  }
  else
    if (gen->domain->classVar)
      outcomeIndex = gen->domain->attributes->size();
    else
      raiseError("'outcomeVar' not set and the domain is class-less");

  int complementary = addComplementary ? eventValue.intV : -1;

  checkProperty(timeVar);
  int timeIndex = gen->domain->getVarNum(timeVar, false);
  if (timeIndex==ILLEGAL_INT)
    raiseError("'timeVar' not found in domain");

  TExampleTable *table = mlnew TExampleTable(gen->domain);
  PExampleGenerator wtable = table;

  if (method == linear) {
    float thisMaxTime = maxTime;

    // No maximal time given: take the largest known time from the input
    // data. (This used to scan the still-empty output table, so the maximum
    // was never found and the check below always raised.)
    if (thisMaxTime<=0.0) {
      PEITERATE(ei, gen) {
        const TValue &tme = (*ei)[timeIndex];
        if (!tme.isSpecial()) {
          if (tme.varType != TValue::FLOATVAR)
            raiseError("invalid time (continuous attribute expected)");
          else
            if (tme.floatV>thisMaxTime)
              thisMaxTime = tme.floatV;
        }
      }
    }

    if (thisMaxTime<=0.0)
      raiseError("invalid time values (max<=0)");

    newWeight = getMetaID();
    PEITERATE(ei, gen) {
      if (!(*ei)[outcomeIndex].isSpecial() && (*ei)[outcomeIndex].intV==failIndex)
        addExample(table, newWeight, *ei, WEIGHT(*ei), complementary);
      else {
        const TValue &tme = (*ei)[timeIndex];
        // need to check it again -- the above check is only run if maxTime is not given
        if (tme.varType != TValue::FLOATVAR)
          raiseError("invalid time (continuous attribute expected)");
        if (!tme.isSpecial())
          addExample(table, newWeight, *ei, WEIGHT(*ei) * (tme.floatV>thisMaxTime ? 1.0 : tme.floatV / thisMaxTime), complementary);
      }
    }
  }

  else if ((method == km) || (method == bayes)) {
    // Was `km==bayes`, a comparison of two distinct constants that could
    // never be true, so the check never fired.
    if ((method==bayes) && (maxTime<=0.0))
      raiseError("'maxTime' should be set when 'method' is 'Bayes'");

    PDistribution KM = (method == km) ? kaplanMeier(gen, outcomeIndex, eventValue, timeIndex, weightID)
                                      : bayesSurvival(gen, outcomeIndex, eventValue, timeIndex, weightID, maxTime);

    // Without an explicit maximal time, use the last value of the
    // distribution.
    float KM_max = maxTime>0.0 ? KM->p(maxTime) : (*KM.AS(TContDistribution)->distribution.rbegin()).second;

    newWeight = getMetaID();
    PEITERATE(ei, gen) {
      if (!(*ei)[outcomeIndex].isSpecial() && (*ei)[outcomeIndex].intV==failIndex)
        addExample(table, newWeight, *ei, WEIGHT(*ei), -1);
      else {
        const TValue &tme = (*ei)[timeIndex];
        if (tme.varType != TValue::FLOATVAR)
          raiseError("invalid time (continuous attribute expected)");
        if (!tme.isSpecial()) {
          // NOTE(review): when maxTime is not set (<= 0) every positive time
          // takes this branch and keeps its original weight -- confirm that
          // this is intended for the km method.
          if (tme.floatV > maxTime)
            addExample(table, newWeight, *ei, WEIGHT(*ei), -1);
          else {
            float KM_t = KM->p(tme.floatV);
            if (method==km) {
              if (KM_t>0) {
                float origw = WEIGHT(*ei);
                float fact = KM_max/KM_t;
                addExample(table, newWeight, *ei, origw*fact, complementary, origw*(1-fact));
              }
            }
            else {
              float origw = WEIGHT(*ei);
              addExample(table, newWeight, *ei, origw*KM_t, complementary, origw*(1-KM_t));
            }
          }
        }
      }
    }
  }

  else
    raiseError("unknown weighting method");

  return wtable;
}
// Default constructor: no attribute list, class discretization switched off
// and no discretization method set.
TPreprocessor_discretize::TPreprocessor_discretize()
  : attributes(),
    discretizeClass(false),
    method()
{
}
// Constructs the preprocessor from explicit settings.
//   attrs - stored in `attributes`
//   nocl  - stored in `discretizeClass`
//   meth  - stored in `method`
TPreprocessor_discretize::TPreprocessor_discretize(PVarList attrs, const bool &nocl, PDiscretization meth)
  : attributes(attrs),
    discretizeClass(nocl),
    method(meth)
{
}
// Builds a new domain in which continuous variables are replaced by the
// variables produced by `method`, and returns the examples of `gen`
// converted to that domain.
//
//   gen       - source examples
//   weightID  - id of the weight meta attribute; passed through unchanged
//   newWeight - (out) set to weightID
//
// Raises an error if `method` is not set (checkProperty) or if an entry of
// `attributes` is not found in the domain or is not continuous.
//
// NOTE(review): `discretizeClass` is not read anywhere in this function; a
// continuous class variable is treated like any other variable of the domain.
// Confirm whether the flag was meant to be consulted here.
PExampleGenerator TPreprocessor_discretize::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
checkProperty(method);
// variables (and meta ids) that have already been replaced
TVarList discretized;
vector<int> discretizedMetas;
TDomain *newDomain = mlnew TDomain();
// wrapped pointer constructed from the same raw pointer as newDomain
PDomain wdomain(newDomain);
const TDomain &domain = gen->domain.getReference();
// Pass 1: ordinary variables. A continuous variable is replaced if the
// attribute list is unset/empty or contains it; everything else is copied.
const_PITERATE(TVarList, vi, domain.variables)
if ( ((*vi)->varType == TValue::FLOATVAR)
&& ( !attributes || !attributes->size()
|| exists(attributes->begin(), attributes->end(), *vi))) {
PVariable evar = method->operator()(gen, *vi);
newDomain->variables->push_back(evar);
newDomain->attributes->push_back(evar);
discretized.push_back(*vi);
}
else {
newDomain->variables->push_back(*vi);
newDomain->attributes->push_back(*vi);
}
// The loop above pushed every variable into `attributes` as well; if the
// source domain has a class, the last one is the class, so move it out.
if (gen->domain->classVar) {
newDomain->classVar = newDomain->variables->back();
newDomain->attributes->erase(newDomain->attributes->end()-1);
}
// Pass 2: requested attributes that were not handled above. These must be
// continuous with a negative variable number; they are added as meta
// descriptors under that number.
if (attributes)
PITERATE(TVarList, ai, attributes)
if (!exists(discretized.begin(), discretized.end(), *ai)) {
long varNum = domain.getVarNum(*ai);
if (varNum == ILLEGAL_INT)
raiseError("Attribute '%s' is not found", (*ai)->name.c_str());
else if ((varNum >= 0) || ((*ai)->varType != TValue::FLOATVAR))
raiseError("Attribute '%s' is not continuous", (*ai)->name.c_str());
else {
PVariable evar = method->operator()(gen, *ai);
TMetaDescriptor ndsc(varNum, evar);
newDomain->metas.push_back(ndsc);
discretizedMetas.push_back(varNum);
}
}
// Copy over the meta descriptors that were not replaced in pass 2.
const_ITERATE(TMetaVector, mi, domain.metas)
if (!exists(discretizedMetas.begin(), discretizedMetas.end(), (*mi).id))
newDomain->metas.push_back(*mi);
newWeight = weightID;
return mlnew TExampleTable(newDomain, gen);
}
// Constructs a classifier for `newVar` that holds a TClassifierFromVar built
// from `newVar` and `oldVar`. The `imputer` member is not set here.
TImputeClassifier::TImputeClassifier(PVariable newVar, PVariable oldVar)
  : TClassifier(newVar),
    classifierFromVar(mlnew TClassifierFromVar(newVar, oldVar))
{
}
// Copy constructor: copies the base part and the two wrapped pointers
// (`classifierFromVar` and `imputer`) from `old`.
TImputeClassifier::TImputeClassifier(const TImputeClassifier &old)
  : TClassifier(old),
    classifierFromVar(old.classifierFromVar),
    imputer(old.imputer)
{
}
// Returns the value `classifierFromVar` yields for `ex`; if that value is
// special, the result of `imputer` for `ex` is returned instead.
// Both `classifierFromVar` and `imputer` must be set (checkProperty).
TValue TImputeClassifier::operator ()(const TExample &ex)
{
  checkProperty(classifierFromVar);
  checkProperty(imputer);

  const TValue stored = classifierFromVar->call(ex);
  if (stored.isSpecial())
    return imputer->call(ex);

  return stored;
}
// Returns the examples of `gen` converted to a new domain in which every
// attribute that has at least one special value in the data is replaced by
// a clone whose `getValueFrom` is a TImputeClassifier. The classifier's
// imputer is obtained by calling `learner` on data whose class is the
// attribute in question and whose attributes are all the other attributes.
//
//   gen       - source examples
//   weightID  - id of the weight meta attribute; passed to the learner and
//               passed through unchanged
//   newWeight - (out) set to weightID
//
// Raises an error if `learner` is not set (checkProperty).
PExampleGenerator TPreprocessor_imputeByLearner::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  checkProperty(learner);

  TDomain &domain = gen->domain.getReference();

  // Indices of attributes for which no special value has been seen so far.
  vector<int> complete;
  const int attrCount = domain.attributes->size();
  for(int attrNo = 0; attrNo < attrCount; attrNo++)
    complete.push_back(attrNo);

  // Indices of attributes found to have a special value, in discovery order.
  vector<int> incomplete;
  PEITERATE(ei, gen) {
    vector<int>::iterator ci = complete.begin();
    while (ci != complete.end()) {
      if ((*ei)[*ci].isSpecial()) {
        incomplete.push_back(*ci);
        ci = complete.erase(ci);   // erase returns the following element
      }
      else
        ++ci;
    }

    // Every attribute already has a special value somewhere; stop scanning.
    if (complete.empty())
      break;
  }

  TVarList newVars = domain.attributes.getReference();

  ITERATE(vector<int>, ii, incomplete) {
    PVariable &oldVar = domain.attributes->at(*ii);
    PVariable newVar = CLONE(TVariable, oldVar);

    // Learning domain: the attribute to impute becomes the class, all the
    // other attributes remain attributes.
    TVarList otherAttrs = domain.attributes.getReference();
    otherAttrs.erase(otherAttrs.begin() + *ii);
    PDomain learnDomain = mlnew TDomain(oldVar, otherAttrs);
    PExampleGenerator learnData = mlnew TExampleTable(learnDomain, gen);

    TImputeClassifier *imputeClassifier = mlnew TImputeClassifier(newVar, oldVar);
    PClassifier wimputeClassifier = imputeClassifier;
    imputeClassifier->imputer = learner->call(learnData, weightID);

    newVar->getValueFrom = wimputeClassifier;
    newVars[*ii] = newVar;
  }

  newWeight = weightID;

  PDomain newDomain = mlnew TDomain(domain.classVar, newVars);
  return mlnew TExampleTable(newDomain, gen);
}
// Constructs the preprocessor, storing `filt` in the `filter` member.
TPreprocessor_filter::TPreprocessor_filter(PFilter filt)
  : filter(filt)
{
}
// Returns the result of filterExamples(filter, gen).
//
//   gen       - source examples
//   weightID  - id of the weight meta attribute; passed through unchanged
//   newWeight - (out) set to weightID
//
// Raises an error if `filter` is not set (checkProperty).
PExampleGenerator TPreprocessor_filter::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  checkProperty(filter);

  newWeight = weightID;

  return filterExamples(filter, gen);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -