📄 preprocessors.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
#include "random.hpp"
#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "table.hpp"
#include "meta.hpp"
#include "filter.hpp"
#include "trindex.hpp"
#include "spec_gen.hpp"
#include "stladdon.hpp"
#include "tabdelim.hpp"
#include "discretize.hpp"
#include "classfromvar.hpp"
#include "cost.hpp"
#include "learn.hpp"
#include <string>
#include "preprocessors.ppp"
// Class descriptions for the two variable-keyed map types: one with PValueFilter
// values ("VariableFilterMap") and one with float values ("VariableFloatMap").
// NOTE(review): the macro is defined outside this file; presumably it registers the
// quoted name as the class name of the instantiated map -- confirm against its definition.
DEFINE_TOrangeMap_classDescription(TOrangeMap_KV, PVariable, PValueFilter, "VariableFilterMap")
DEFINE_TOrangeMap_classDescription(TOrangeMap_K, PVariable, float, "VariableFloatMap")
// MSVC only: silence warning 4100 for the rest of this translation unit.
#ifdef _MSC_VER
#pragma warning (disable : 4100) // unreferenced local parameter (macros name all arguments)
#endif
/* Passes 'generator' through 'filter' (via a TFilteredGenerator) and returns the
   outcome as a newly allocated TExampleTable. */
PExampleGenerator TPreprocessor::filterExamples(PFilter filter, PExampleGenerator generator)
{
  TFilteredGenerator filtered(filter, generator);

  // The table is built from the filtered generator; the wrapper around the
  // stack object is a temporary that lives only for this statement.
  TExampleTable *kept = mlnew TExampleTable(PExampleGenerator(filtered));
  return PExampleGenerator(kept);
}
/* Applies 'filter' to every example produced by 'generator' and returns the
   results as a list of booleans, one entry per example, in iteration order. */
PBoolList TPreprocessor::filterSelectionVector(PFilter filter, PExampleGenerator generator)
{
  TBoolList *flags = new TBoolList;
  PBoolList wrappedFlags = flags;   // wrap immediately so the list is owned from here on

  // Reserve space up front only when the generator reports a positive count.
  const int expected = generator->numberOfExamples();
  if (expected > 0)
    flags->reserve(expected);

  TFilter &accepts = filter.getReference();
  PEITERATE(ei, generator)
    flags->push_back(accepts(*ei));

  return wrappedFlags;
}
/* Base-class implementation: reports that selection vectors are not supported.
   Both arguments are ignored. */
PBoolList TPreprocessor::selectionVector(PExampleGenerator, const int &)
{
  raiseError("this class doesn't support method 'selectionVector'");

  // Gives the function a return value on every path.
  return NULL;
}
// Default constructor: 'attributes' is a newly allocated, default-constructed TVarList.
TPreprocessor_ignore::TPreprocessor_ignore()
  : attributes(mlnew TVarList())
{
}

// Uses the given list 'attrs' as 'attributes'.
TPreprocessor_ignore::TPreprocessor_ignore(PVarList attrs)
  : attributes(attrs)
{
}
/* Returns a new table over a clone of gen's domain from which every variable in
   'attributes' has been removed. A listed variable that is the class is removed
   with removeClass(); any other variable that cannot be deleted is reported as
   an error. The weight id is passed through unchanged. */
PExampleGenerator TPreprocessor_ignore::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PDomain reducedDomain = CLONE(TDomain, gen->domain);

  PITERATE(TVarList, vi, attributes) {
    if (reducedDomain->delVariable(*vi))
      continue;   // deleted successfully

    // delVariable refused: the variable is either the class or not in the domain at all
    if (*vi == reducedDomain->classVar) {
      reducedDomain->removeClass();
    }
    else {
      raiseError("attribute '%s' not found", (*vi)->name.c_str());
    }
  }

  newWeight = weightID;
  return PExampleGenerator(mlnew TExampleTable(reducedDomain, gen));
}
// Default constructor: 'attributes' is a newly allocated, default-constructed TVarList.
TPreprocessor_select::TPreprocessor_select()
  : attributes(mlnew TVarList())
{
}

// Uses the given list 'attrs' as 'attributes'.
TPreprocessor_select::TPreprocessor_select(PVarList attrs)
  : attributes(attrs)
{
}
/* Returns a new table over a clone of gen's domain that keeps only the variables
   listed in 'attributes': every attribute of the original domain that is not
   listed is deleted from the clone, and the class is removed unless it is listed
   too. The weight id is passed through unchanged. */
PExampleGenerator TPreprocessor_select::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PDomain reducedDomain = CLONE(TDomain, gen->domain);
  TVarList::const_iterator keepBegin(attributes->begin()), keepEnd(attributes->end());

  // Iterate over the original domain's attributes; deletions go to the clone only.
  PITERATE(TVarList, vi, gen->domain->attributes) {
    const bool listed = find(keepBegin, keepEnd, *vi) != keepEnd;
    if (!listed)
      reducedDomain->delVariable(*vi);
  }

  const bool classListed = find(keepBegin, keepEnd, reducedDomain->classVar) != keepEnd;
  if (!classListed)
    reducedDomain->removeClass();

  newWeight = weightID;
  return PExampleGenerator(mlnew TExampleTable(reducedDomain, gen));
}
/* Builds a TFilter_values for domain 'dom' from the (variable -> value filter)
   map 'values'. Each value filter is cloned and its 'position' is set to the
   variable's number in 'dom'. 'conj' and 'negate' are forwarded unchanged to the
   TFilter_values constructor. */
PFilter TPreprocessor_take::constructFilter(PVariableFilterMap values, PDomain dom, bool conj, bool negate)
{
  TValueFilterList *conditions = mlnew TValueFilterList();
  PValueFilterList wrappedConditions = conditions;
  const TDomain &domain = dom.getReference();

  PITERATE(TVariableFilterMap, vi, values) {
    TValueFilter *condition = CLONE(TValueFilter, (*vi).second);
    // Hand the clone to the list first (this wraps the raw pointer), and only
    // then call getVarNum.
    conditions->push_back(condition);
    condition->position = domain.getVarNum((*vi).first);
  }

  return mlnew TFilter_values(wrappedConditions, conj, negate, dom);
}
// Default constructor: a newly allocated, default-constructed filter map; 'conjunction' is true.
TPreprocessor_take::TPreprocessor_take()
  : values(mlnew TVariableFilterMap()),
    conjunction(true)
{
}

// Uses the given filter map and conjunction flag.
TPreprocessor_take::TPreprocessor_take(PVariableFilterMap avalues, bool aconj)
  : values(avalues),
    conjunction(aconj)
{
}
/* Returns a new table with the examples of 'gen' that pass the filter built from
   'values' (constructed with negate = false). The weight id is passed through
   unchanged. */
PExampleGenerator TPreprocessor_take::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter matching = constructFilter(values, gen->domain, conjunction, false);
  return filterExamples(matching, gen);
}
/* Returns, for each example of 'gen', the result of the filter built from
   'values' (constructed with negate = false). The second argument is ignored. */
PBoolList TPreprocessor_take::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter matching = constructFilter(values, gen->domain, conjunction, false);
  return filterSelectionVector(matching, gen);
}
// Default constructor: a newly allocated, default-constructed filter map; 'conjunction' is true.
TPreprocessor_drop::TPreprocessor_drop()
  : values(mlnew TVariableFilterMap()),
    conjunction(true)
{
}

// Uses the given filter map and conjunction flag.
TPreprocessor_drop::TPreprocessor_drop(PVariableFilterMap avalues, bool aconj)
  : values(avalues),
    conjunction(aconj)
{
}
/* Returns a new table with the examples of 'gen' that pass the filter built from
   'values' with negate = true (the filter is the one TPreprocessor_take builds,
   but negated). The weight id is passed through unchanged. */
PExampleGenerator TPreprocessor_drop::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter notMatching = TPreprocessor_take::constructFilter(values, gen->domain, conjunction, true);
  return filterExamples(notMatching, gen);
}
/* Returns, for each example of 'gen', the result of the filter built from
   'values' with negate = true. The second argument is ignored. */
PBoolList TPreprocessor_drop::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter notMatching = TPreprocessor_take::constructFilter(values, gen->domain, conjunction, true);
  return filterSelectionVector(notMatching, gen);
}
/* Copies 'gen' into a new table and calls removeDuplicates on it. If no weight
   id is given (weightID == 0), a new meta id is obtained and added to the table
   as a meta attribute with value 1.0; otherwise the given weight id is reused.
   The id that was used is reported through 'newWeight'. */
PExampleGenerator TPreprocessor_removeDuplicates::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PExampleGenerator table = mlnew TExampleTable(gen);

  if (!weightID) {
    // No weights yet: introduce a new meta attribute, initialised to 1.0.
    newWeight = getMetaID();
    table.AS(TExampleTable)->addMetaAttribute(newWeight, TValue(float(1.0)));
  }
  else {
    newWeight = weightID;
  }

  table.AS(TExampleTable)->removeDuplicates(newWeight);
  return table;
}
/* Returns a new table with the examples of 'gen' accepted by
   TFilter_hasSpecial(true); the weight id is passed through unchanged.
   NOTE(review): presumably the 'true' argument negates the filter, so that
   examples with special (missing) values are dropped -- confirm in filter.hpp. */
PExampleGenerator TPreprocessor_dropMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter specialsFilter = mlnew TFilter_hasSpecial(true);
  return filterExamples(specialsFilter, gen);
}
/* Returns, for each example of 'gen', the result of TFilter_hasSpecial(true).
   The second argument is ignored. */
PBoolList TPreprocessor_dropMissing::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter specialsFilter = mlnew TFilter_hasSpecial(true);
  return filterSelectionVector(specialsFilter, gen);
}
/* Returns a new table with the examples of 'gen' accepted by
   TFilter_hasSpecial(false); the weight id is passed through unchanged.
   NOTE(review): presumably this keeps only the examples that have special
   (missing) values -- confirm the meaning of the flag in filter.hpp. */
PExampleGenerator TPreprocessor_takeMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter specialsFilter = mlnew TFilter_hasSpecial(false);
  return filterExamples(specialsFilter, gen);
}
/* Returns, for each example of 'gen', the result of TFilter_hasSpecial(false).
   The second argument is ignored. */
PBoolList TPreprocessor_takeMissing::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter specialsFilter = mlnew TFilter_hasSpecial(false);
  return filterSelectionVector(specialsFilter, gen);
}
/* Returns a new table with the examples of 'gen' accepted by
   TFilter_hasClassValue(false); the weight id is passed through unchanged.
   NOTE(review): presumably 'false' means "not negated", i.e. examples without a
   class value are dropped -- confirm in filter.hpp. */
PExampleGenerator TPreprocessor_dropMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter classFilter = mlnew TFilter_hasClassValue(false);
  return filterExamples(classFilter, gen);
}
/* Returns, for each example of 'gen', the result of TFilter_hasClassValue(false).
   The second argument is ignored. */
PBoolList TPreprocessor_dropMissingClasses::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter classFilter = mlnew TFilter_hasClassValue(false);
  return filterSelectionVector(classFilter, gen);
}
/* Returns a new table with the examples of 'gen' accepted by
   TFilter_hasClassValue(true); the weight id is passed through unchanged.
   NOTE(review): presumably 'true' negates the filter, i.e. only examples without
   a class value are kept -- confirm in filter.hpp. */
PExampleGenerator TPreprocessor_takeMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter classFilter = mlnew TFilter_hasClassValue(true);
  return filterExamples(classFilter, gen);
}
/* Returns, for each example of 'gen', the result of TFilter_hasClassValue(true).
   The second argument is ignored. */
PBoolList TPreprocessor_takeMissingClasses::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter classFilter = mlnew TFilter_hasClassValue(true);
  return filterSelectionVector(classFilter, gen);
}
/* Overwrites the value of variable number 'index' in some of the examples of 'table'.

   index       position of the variable in table->domain->variables
   proportion  share of the examples to change; N*proportion is truncated to an int
   mri         index generator; its 'p' field is replaced by this function
   table       examples to modify in place

   'mri.p' is set to one entry per value of the variable, each equal to
   ceil(changed / nvals), and mri(N) is then asked for a list of indices. The list
   is walked in parallel with the examples: wherever the entry is below nvals, the
   example's value is set to that entry; other examples are left untouched.

   If the variable reports no values (nvals <= 0) there is nothing that could be
   assigned, so the function returns without touching 'table' or 'mri'. */
void addNoise(const int &index, const float &proportion, TMakeRandomIndicesN &mri, TExampleTable *table)
{
  const int nvals = table->domain->variables->at(index)->noOfValues();
  // Guard: nvals is used as a divisor and as a list length below.
  if (nvals <= 0)
    return;

  const int N = table->size();
  const int changed = int(N*proportion);
  // number of changed examples per value, rounded up
  const int cdiv = (changed+(nvals-1)) / nvals;

  mri.p = mlnew TFloatList(nvals, cdiv);
  PLongList rind(mri(N));

  TLongList::const_iterator ri(rind->begin());
  PEITERATE(ei, table) {
    if (*ri < nvals)
      (*ei)[index] = TValue(int(*ri));
    ++ri;
  }
}
// Stores 'cn' as the proportion of noise; it is validated when the preprocessor is applied.
TPreprocessor_addClassNoise::TPreprocessor_addClassNoise(const float &cn)
  : proportion(cn)
{
}
/* Returns a copy of 'gen' in which class values have been changed by addNoise
   according to 'proportion'. Raises an error if the domain has no class, if the
   class is not discrete, or if 'proportion' is below 0 or above 1. With a
   proportion of 0 the copy is returned unmodified. The weight id is passed
   through unchanged. */
PExampleGenerator TPreprocessor_addClassNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar)
    raiseError("Class-less domain");

  if (gen->domain->classVar->varType != TValue::INTVAR)
    raiseError("Discrete class value expected");

  if ((proportion<0.0) || (proportion>1.0))
    raiseError("invalid 'proportion'");

  TExampleTable *noisy = mlnew TExampleTable(gen);
  PExampleGenerator wrappedNoisy = noisy;

  if (proportion>0.0) {
    TMakeRandomIndicesN indexMaker;
    // Use our own generator when set; otherwise a newly constructed one.
    indexMaker.randomGenerator = randomGenerator ? randomGenerator : mlnew TRandomGenerator;

    // The class is addressed by the index right after the last attribute.
    const int classIndex = noisy->domain->attributes->size();
    addNoise(classIndex, proportion, indexMaker, noisy);
  }

  newWeight = weightID;
  return wrappedNoisy;
}
// Default constructor: a newly allocated, default-constructed proportion map; default proportion 0.
TPreprocessor_addNoise::TPreprocessor_addNoise()
  : proportions(mlnew TVariableFloatMap()),
    defaultProportion(0.0)
{
}

// Uses the given per-variable proportions and default proportion.
TPreprocessor_addNoise::TPreprocessor_addNoise(PVariableFloatMap probs, const float &defprob)
  : proportions(probs),
    defaultProportion(defprob)
{
}
/* Copies the per-variable values from 'proportions' into 'props', indexed by the
   variable's number in 'domain'.

   'props' should be initialized to length domain.attributes->size(), filled with
   the default proportion. An entry whose index lies beyond the current end of
   'props' (the class, when it is explicitly requested) makes 'props' grow so that
   the index becomes valid; slots added in between, if any, are set to 0.

   Variables for which getVarNum returns a negative number have no slot in
   'props' and are skipped. Without this check the negative index would be
   converted to a huge unsigned value in the size comparison and the entry would
   be appended as if it belonged to the class.

   A null 'proportions' leaves 'props' untouched. */
void getProportions(PVariableFloatMap &proportions, const TDomain &domain, vector<float> &props)
{
  if (!proportions)
    return;

  PITERATE(TVariableFloatMap, vi, proportions) {
    const int idx = domain.getVarNum((*vi).first);
    if (idx < 0)
      continue;

    const vector<float>::size_type pos = idx;
    // class is included if this is explicitly requested
    if (pos >= props.size())
      props.resize(pos + 1, 0.0f);
    props[pos] = (*vi).second;
  }
}
/* Returns a copy of 'gen' in which addNoise has been applied to every variable
   with a positive proportion. Attributes start from 'defaultProportion'
   (negative defaults count as 0); entries in 'proportions' override it and are
   the only way to include the class. Raises an error when noise is requested for
   a non-discrete variable. The weight id is passed through unchanged. */
PExampleGenerator TPreprocessor_addNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  // Nothing requested: return the plain copy.
  if (!proportions && (defaultProportion<=0.0))
    return wtable;

  const TDomain &domain = gen->domain.getReference();

  // We mustn't allow MakeRandomIndicesN to initialize a new generator each time
  // it's called, since we'd then always select the same examples.
  PRandomGenerator rg = randomGenerator ? randomGenerator : mlnew TRandomGenerator;
  TMakeRandomIndicesN makerind;
  makerind.randomGenerator = rg;

  // this will not assign the defaultProportion to the class
  vector<float> props(domain.attributes->size(), defaultProportion > 0.0 ? defaultProportion : 0.0);
  getProportions(proportions, domain, props);

  int idx = 0;
  for(vector<float>::const_iterator pi(props.begin()), pe(props.end()); pi != pe; ++pi, ++idx) {
    if (*pi > 0.0) {
      const PVariable &var = domain.variables->at(idx);
      if (var->varType != TValue::INTVAR)
        raiseError("Cannot add noise to non-discrete attribute '%s'", var->name.c_str());
      addNoise(idx, *pi, makerind, table);
    }
  }

  return wtable;
}
// Default constructor: a newly allocated, default-constructed deviation map; default deviation 0.
TPreprocessor_addGaussianNoise::TPreprocessor_addGaussianNoise()
  : deviations(mlnew TVariableFloatMap()),
    defaultDeviation(0.0)
{
}

// Uses the given per-variable deviations and default deviation.
TPreprocessor_addGaussianNoise::TPreprocessor_addGaussianNoise(PVariableFloatMap devs, const float &defdev)
  : deviations(devs),
    defaultDeviation(defdev)
{
}
/* Ordering predicate on the first member only: returns 1 when o1.first is
   smaller than o2.first and 0 otherwise. The float member is not looked at. */
int cmp1st(const pair<int, float> &o1, const pair<int, float> &o2)
{
  const bool firstIsSmaller = o2.first > o1.first;
  return firstIsSmaller ? 1 : 0;
}
/* For Gaussian noise we use TGaussianNoiseGenerator; the advantage against going
   attribute by attribute (like in addNoise) is that it might require less paging
   on huge datasets.

   Returns a new table built from 'gen' passed through a TGaussianNoiseGenerator.
   The generator receives a list of (variable number, deviation) pairs sorted by
   variable number: one pair for every entry of 'deviations', plus, when
   'defaultDeviation' is positive, one pair for every continuous attribute not
   mentioned in 'deviations'. Raises an error if 'deviations' names a
   non-continuous variable. The weight id is passed through unchanged.

   A non-positive 'defaultDeviation' means "no default noise". This matches the
   early exit below; the check used to be 'if (defaultDeviation)', which treated
   a negative default as enabled. */
PExampleGenerator TPreprocessor_addGaussianNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  if (!deviations && (defaultDeviation<=0.0))
    return mlnew TExampleTable(gen);

  const TDomain &domain = gen->domain.getReference();
  vector<pair<int, float> > ps;
  // marks the attributes that received an explicit deviation
  vector<bool> attributeUsed(domain.attributes->size(), false);

  if (deviations)
    PITERATE(TVariableFloatMap, vi, deviations) {
      PVariable var = (*vi).first;
      if (var->varType != TValue::FLOATVAR)
        raiseError("attribute '%s' is not continuous", var->name.c_str());

      const int pos = domain.getVarNum(var);
      ps.push_back(pair<int, float>(pos, (*vi).second));

      // Only ordinary attributes have a slot in attributeUsed; the sign is tested
      // first so that the size comparison is done on a non-negative value.
      if ((pos >= 0) && (vector<bool>::size_type(pos) < attributeUsed.size()))
        attributeUsed[pos] = true;
    }

  if (defaultDeviation > 0.0) {
    TVarList::const_iterator vi(domain.attributes->begin());
    const vector<bool>::const_iterator bb = attributeUsed.begin();
    const_ITERATE(vector<bool>, bi, attributeUsed) {
      if (!*bi && ((*vi)->varType == TValue::FLOATVAR))
        ps.push_back(pair<int, float>(int(bi-bb), defaultDeviation));
      ++vi;
    }
  }

  sort(ps.begin(), ps.end(), cmp1st);

  TGaussianNoiseGenerator gg = TGaussianNoiseGenerator(ps, gen, randomGenerator);
  return PExampleGenerator(mlnew TExampleTable(PExampleGenerator(gg)));
}
// Default constructor: a newly allocated, default-constructed proportion map,
// default proportion 0 and special type valueDK.
TPreprocessor_addMissing::TPreprocessor_addMissing()
  : proportions(mlnew TVariableFloatMap()),
    defaultProportion(0.0),
    specialType(valueDK)
{
}

// Uses the given per-variable proportions, default proportion and special type.
TPreprocessor_addMissing::TPreprocessor_addMissing(PVariableFloatMap probs, const float &defprob, const int &st)
  : proportions(probs),
    defaultProportion(defprob),
    specialType(st)
{
}
PExampleGenerator TPreprocessor_addMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
newWeight = weightID;
if (!proportions && (defaultProportion<=0.0))
return mlnew TExampleTable(gen);
const TDomain &domain = gen->domain.getReference();
TExampleTable *table = mlnew TExampleTable(gen);
PExampleGenerator wtable = table;
// We mustn't allow MakeRandomIndices2 to initalize a new generator each time it's called since we'd than always select the same examples
const int n = table->size();
TMakeRandomIndices2 makerind;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -