⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 preprocessors.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "random.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "table.hpp"
#include "meta.hpp"


#include "filter.hpp"
#include "trindex.hpp"
#include "spec_gen.hpp"
#include "stladdon.hpp"
#include "tabdelim.hpp"
#include "discretize.hpp"
#include "classfromvar.hpp"
#include "cost.hpp"
#include "learn.hpp"

#include <string>
#include "preprocessors.ppp"

// Class descriptions for the two map types used by the preprocessors below:
// PVariable -> PValueFilter ("VariableFilterMap") and PVariable -> float ("VariableFloatMap").
// NOTE(review): DEFINE_TOrangeMap_classDescription is defined elsewhere; its exact expansion
// is not visible here -- confirm before changing the arguments.
DEFINE_TOrangeMap_classDescription(TOrangeMap_KV, PVariable, PValueFilter, "VariableFilterMap")
DEFINE_TOrangeMap_classDescription(TOrangeMap_K, PVariable, float, "VariableFloatMap")

// MSVC only: silence warning C4100 for the rest of this file.
#ifdef _MSC_VER
  #pragma warning (disable : 4100) // unreferenced local parameter (macros name all arguments)
#endif


/* Passes 'generator' through 'filter' (via a TFilteredGenerator) and stores
   whatever the filtered generator yields in a newly allocated TExampleTable. */
PExampleGenerator TPreprocessor::filterExamples(PFilter filter, PExampleGenerator generator)
{
  TFilteredGenerator filtered(filter, generator);
  PExampleGenerator filteredGen(filtered);

  return PExampleGenerator(mlnew TExampleTable(filteredGen));
}


/* Applies 'filter' to every example of 'generator' and returns the outcomes
   as a list of booleans: one element per example, in iteration order. */
PBoolList TPreprocessor::filterSelectionVector(PFilter filter, PExampleGenerator generator)
{
  // Allocated with mlnew, like every other allocation in this file, and wrapped
  // immediately; the wrapper is what gets returned.
  TBoolList *selection = mlnew TBoolList;
  PBoolList pselection = selection;

  // Pre-size the list only when the generator reports a positive example count.
  const int nex = generator->numberOfExamples();
  if (nex > 0)
    selection->reserve(nex);

  // Take the reference once instead of dereferencing the wrapper per example.
  TFilter &filt = filter.getReference();
  PEITERATE(ei, generator)
    selection->push_back(filt(*ei));

  return pselection;
}


/* TPreprocessor's own selectionVector: always reports an error.
   Both arguments are ignored. */
PBoolList TPreprocessor::selectionVector(PExampleGenerator, const int &)
{
  raiseError("this class doesn't support method 'selectionVector'");

  // Only reached if raiseError returns.
  return NULL;
}


/* Default constructor: 'attributes' starts as a newly allocated, empty TVarList. */
TPreprocessor_ignore::TPreprocessor_ignore()
  : attributes(mlnew TVarList())
{
}


/* Constructs the preprocessor with the given list of variables to remove. */
TPreprocessor_ignore::TPreprocessor_ignore(PVarList attrs)
  : attributes(attrs)
{
}


/* Builds a new example table over a copy of gen's domain from which every
   variable listed in 'attributes' has been removed.

   When delVariable() fails for a listed variable, the variable is compared
   with the class variable: if it is the class, the class is removed with
   removeClass(); otherwise an error is raised.

   newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_ignore::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PDomain reducedDomain = CLONE(TDomain, gen->domain);

  PITERATE(TVarList, vi, attributes) {
    if (!reducedDomain->delVariable(*vi)) {
      if (*vi == reducedDomain->classVar) {
        reducedDomain->removeClass();
      }
      else {
        raiseError("attribute '%s' not found", (*vi)->name.c_str());
      }
    }
  }

  newWeight = weightID;
  return PExampleGenerator(mlnew TExampleTable(reducedDomain, gen));
}



/* Default constructor: 'attributes' starts as a newly allocated, empty TVarList. */
TPreprocessor_select::TPreprocessor_select()
  : attributes(mlnew TVarList())
{
}


/* Constructs the preprocessor with the given list of variables to keep. */
TPreprocessor_select::TPreprocessor_select(PVarList attrs)
  : attributes(attrs)
{
}


/* Builds a new example table over a copy of gen's domain that keeps only the
   variables listed in 'attributes'.

   The loop walks the attributes of the original domain and deletes from the
   copy. The class is removed from the copy when the class variable is not
   found in 'attributes'.

   newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_select::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PDomain keptDomain = CLONE(TDomain, gen->domain);
  TVarList::const_iterator selBegin(attributes->begin()), selEnd(attributes->end());

  PITERATE(TVarList, vi, gen->domain->attributes) {
    const bool selected = find(selBegin, selEnd, *vi) != selEnd;
    if (!selected)
      keptDomain->delVariable(*vi);
  }

  const bool classSelected = find(selBegin, selEnd, keptDomain->classVar) != selEnd;
  if (!classSelected)
    keptDomain->removeClass();

  newWeight = weightID;
  return PExampleGenerator(mlnew TExampleTable(keptDomain, gen));
}




/* Builds a TFilter_values from a variable -> value-filter map.

   Every value filter in 'values' is cloned, appended to a new list, and its
   'position' is set to the number that dom's getVarNum() reports for the
   variable it is keyed by. 'conj', 'negate' and 'dom' are handed to the
   TFilter_values constructor unchanged. */
PFilter TPreprocessor_take::constructFilter(PVariableFilterMap values, PDomain dom, bool conj, bool negate)
{
  TValueFilterList *conditions = mlnew TValueFilterList();
  PValueFilterList wconditions = conditions;

  const TDomain &domain = dom.getReference();

  PITERATE(TVariableFilterMap, vi, values) {
    TValueFilter *condition = CLONE(TValueFilter, (*vi).second);
    // Appending wraps the clone (per the original author's note); the position
    // is assigned only afterwards, through the raw pointer.
    conditions->push_back(condition);
    condition->position = domain.getVarNum((*vi).first);
  }

  return mlnew TFilter_values(wconditions, conj, negate, dom);
}



/* Default constructor: an empty, newly allocated filter map; 'conjunction' is true. */
TPreprocessor_take::TPreprocessor_take()
  : values(mlnew TVariableFilterMap()), conjunction(true)
{
}


/* Constructs the preprocessor from a filter map and a conjunction flag. */
TPreprocessor_take::TPreprocessor_take(PVariableFilterMap avalues, bool aconj)
  : values(avalues), conjunction(aconj)
{
}


/* Returns the examples of 'gen' that pass the non-negated filter built from
   'values' and 'conjunction'. newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_take::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter takeFilter = constructFilter(values, gen->domain, conjunction, false);
  return filterExamples(takeFilter, gen);
}


/* Returns, per example of 'gen', the result of the non-negated filter built
   from 'values' and 'conjunction'. The weight ID argument is ignored. */
PBoolList TPreprocessor_take::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter takeFilter = constructFilter(values, gen->domain, conjunction, false);
  return filterSelectionVector(takeFilter, gen);
}



/* Default constructor: an empty, newly allocated filter map; 'conjunction' is true. */
TPreprocessor_drop::TPreprocessor_drop()
  : values(mlnew TVariableFilterMap()), conjunction(true)
{
}


/* Constructs the preprocessor from a filter map and a conjunction flag. */
TPreprocessor_drop::TPreprocessor_drop(PVariableFilterMap avalues, bool aconj)
  : values(avalues), conjunction(aconj)
{
}


/* Returns the examples of 'gen' that pass the negated filter built by
   TPreprocessor_take::constructFilter from 'values' and 'conjunction'.
   newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_drop::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter dropFilter = TPreprocessor_take::constructFilter(values, gen->domain, conjunction, true);
  return filterExamples(dropFilter, gen);
}

  
/* Returns, per example of 'gen', the result of the negated filter built by
   TPreprocessor_take::constructFilter. The weight ID argument is ignored. */
PBoolList TPreprocessor_drop::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter dropFilter = TPreprocessor_take::constructFilter(values, gen->domain, conjunction, true);
  return filterSelectionVector(dropFilter, gen);
}




/* Copies 'gen' into a new example table and calls removeDuplicates() on it.

   When weightID is zero, a fresh meta ID is obtained, a meta attribute with
   value 1.0 is added to the table under that ID, and the ID is reported
   through newWeight. Otherwise newWeight is simply weightID. */
PExampleGenerator TPreprocessor_removeDuplicates::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PExampleGenerator table = mlnew TExampleTable(gen);

  if (!weightID) {
    newWeight = getMetaID();
    table.AS(TExampleTable)->addMetaAttribute(newWeight, TValue(float(1.0)));
  }
  else {
    newWeight = weightID;
  }

  table.AS(TExampleTable)->removeDuplicates(newWeight);
  return table;
}



/* Filters 'gen' with TFilter_hasSpecial(true); newWeight receives weightID unchanged.
   NOTE(review): the constructor argument is presumably the filter's 'negate' flag -- confirm. */
PExampleGenerator TPreprocessor_dropMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter specialFilter = mlnew TFilter_hasSpecial(true);
  return filterExamples(specialFilter, gen);
}


/* Returns, per example of 'gen', the result of TFilter_hasSpecial(true).
   The weight ID argument is ignored. */
PBoolList TPreprocessor_dropMissing::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter specialFilter = mlnew TFilter_hasSpecial(true);
  return filterSelectionVector(specialFilter, gen);
}


/* Filters 'gen' with TFilter_hasSpecial(false); newWeight receives weightID unchanged.
   NOTE(review): the constructor argument is presumably the filter's 'negate' flag -- confirm. */
PExampleGenerator TPreprocessor_takeMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter specialFilter = mlnew TFilter_hasSpecial(false);
  return filterExamples(specialFilter, gen);
}


/* Returns, per example of 'gen', the result of TFilter_hasSpecial(false).
   The weight ID argument is ignored. */
PBoolList TPreprocessor_takeMissing::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter specialFilter = mlnew TFilter_hasSpecial(false);
  return filterSelectionVector(specialFilter, gen);
}


/* Filters 'gen' with TFilter_hasClassValue(false); newWeight receives weightID unchanged.
   NOTE(review): the constructor argument is presumably the filter's 'negate' flag -- confirm. */
PExampleGenerator TPreprocessor_dropMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter classFilter = mlnew TFilter_hasClassValue(false);
  return filterExamples(classFilter, gen);
}


/* Returns, per example of 'gen', the result of TFilter_hasClassValue(false).
   The weight ID argument is ignored. */
PBoolList TPreprocessor_dropMissingClasses::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter classFilter = mlnew TFilter_hasClassValue(false);
  return filterSelectionVector(classFilter, gen);
}


/* Filters 'gen' with TFilter_hasClassValue(true); newWeight receives weightID unchanged.
   NOTE(review): the constructor argument is presumably the filter's 'negate' flag -- confirm. */
PExampleGenerator TPreprocessor_takeMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  PFilter classFilter = mlnew TFilter_hasClassValue(true);
  return filterExamples(classFilter, gen);
}


/* Returns, per example of 'gen', the result of TFilter_hasClassValue(true).
   The weight ID argument is ignored. */
PBoolList TPreprocessor_takeMissingClasses::selectionVector(PExampleGenerator gen, const int &)
{
  PFilter classFilter = mlnew TFilter_hasClassValue(true);
  return filterSelectionVector(classFilter, gen);
}



/* Overwrites the value at position 'index' in some examples of 'table' with
   values drawn through 'mri'.

   index       position of the variable in table->domain->variables
   proportion  share of the examples to change; N*proportion is truncated to int
   mri         index generator; its 'p' member is replaced by this function
   table       example table modified in place

   mri.p is set to a list of nvals entries, each equal to
   ceil(changed / nvals). mri(N) is then read in step with the examples:
   an index below nvals becomes the example's new value, any other index
   leaves the example untouched.
   NOTE(review): this relies on mri(N) yielding at least one index per example -- confirm. */
void addNoise(const int &index, const float &proportion, TMakeRandomIndicesN &mri, TExampleTable *table)
{
  const int nvals = table->domain->variables->at(index)->noOfValues();

  // A variable without values has nothing that could be assigned; returning
  // here also keeps the division below from dividing by zero.
  if (nvals <= 0)
    return;

  const int N = table->size();
  const int changed = int(N*proportion);
  const int cdiv = (changed+(nvals-1)) / nvals; // changed / nvals, rounded up
  mri.p = mlnew TFloatList(nvals, cdiv);

  PLongList rind(mri(N));
  TLongList::const_iterator ri(rind->begin());
  PEITERATE(ei, table) {
    if (*ri < nvals)
      (*ei)[index] = TValue(int(*ri));
    ++ri;
  }
}


/* Constructs the preprocessor with the given class-noise proportion. */
TPreprocessor_addClassNoise::TPreprocessor_addClassNoise(const float &cn)
  : proportion(cn)
{
}


/* Returns a copy of 'gen' in which addNoise() has been applied to the class
   variable with the configured 'proportion'.

   Raises an error when the domain has no class, when the class is not of
   type TValue::INTVAR, or when 'proportion' is below 0 or above 1.
   With a proportion of exactly 0 the copy is returned unchanged.
   newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_addClassNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar)
    raiseError("Class-less domain");

  if (gen->domain->classVar->varType != TValue::INTVAR)
    raiseError("Discrete class value expected");

  if ((proportion<0.0) || (proportion>1.0))
    raiseError("invalid 'proportion'");

  TExampleTable *noisy = mlnew TExampleTable(gen);
  PExampleGenerator wnoisy = noisy;

  if (proportion>0.0) {
    TMakeRandomIndicesN indexMaker;

    // Use the preprocessor's own random generator when it has one.
    if (randomGenerator)
      indexMaker.randomGenerator = randomGenerator;
    else
      indexMaker.randomGenerator = mlnew TRandomGenerator;

    // The index passed for the class is the number of attributes.
    addNoise(noisy->domain->attributes->size(), proportion, indexMaker, noisy);
  }

  newWeight = weightID;
  return wnoisy;
}



/* Default constructor: an empty, newly allocated proportion map and a default proportion of 0. */
TPreprocessor_addNoise::TPreprocessor_addNoise()
  : proportions(mlnew TVariableFloatMap()), defaultProportion(0.0)
{
}


/* Constructs the preprocessor from per-variable proportions and a default proportion. */
TPreprocessor_addNoise::TPreprocessor_addNoise(PVariableFloatMap probs, const float &defprob)
  : proportions(probs), defaultProportion(defprob)
{
}



/* Copies the per-variable values of 'proportions' into 'props', indexed by the
   number domain.getVarNum() reports for each variable.

   The caller initializes 'props' to domain.attributes->size() elements holding
   the default proportion. An index at or past the end of 'props' (the class,
   which is included only when explicitly requested) grows the vector so the
   value lands at that index.

   A negative index is skipped: it cannot address 'props', and comparing it
   with props.size() as unsigned would wrongly treat it as "past the end".
   NOTE(review): a negative getVarNum() result presumably denotes a variable
   outside domain.variables (the Gaussian-noise code in this file guards
   pos >= 0 for the same call) -- confirm that ignoring it is the desired policy. */
void getProportions(PVariableFloatMap &proportions, const TDomain &domain, vector<float> &props)
{
  if (!proportions)
    return;

  PITERATE(TVariableFloatMap, vi, proportions) {
    const int idx = domain.getVarNum((*vi).first);
    if (idx >= 0) {
      const vector<float>::size_type pos = idx;
      // class is included if this is explicitly requested
      if (pos >= props.size())
        props.resize(pos + 1, 0.0f);
      props[pos] = (*vi).second;
    }
  }
}



/* Returns a copy of 'gen' in which addNoise() has been applied to every
   variable whose proportion is positive.

   Proportions start at 'defaultProportion' (or 0 when that is not positive)
   for each attribute and are then overridden by getProportions() from
   'proportions'. A variable with a positive proportion that is not of type
   TValue::INTVAR raises an error.

   When there is no proportion map and the default is not positive, a plain
   copy of 'gen' is returned. newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_addNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  if (!proportions && (defaultProportion<=0.0))
    return mlnew TExampleTable(gen);

  const TDomain &domain = gen->domain.getReference();

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  // We mustn't allow MakeRandomIndicesN to initialize a new generator each time
  // it's called, since we'd then always select the same examples; one generator
  // is therefore created here and shared by all addNoise() calls below.
  PRandomGenerator rg = randomGenerator ? randomGenerator : mlnew TRandomGenerator;
  TMakeRandomIndicesN makerind;
  makerind.randomGenerator = rg;

  // this will not assign the defaultProportion to the class
  vector<float> props(domain.attributes->size(), defaultProportion > 0.0 ? defaultProportion : 0.0);
  getProportions(proportions, domain, props);

  int idx = 0;
  for (vector<float>::const_iterator pi(props.begin()), pe(props.end()); pi != pe; ++pi, ++idx) {
    if (*pi > 0.0) {
      const PVariable &var = domain.variables->at(idx);
      if (var->varType != TValue::INTVAR)
        raiseError("Cannot add noise to non-discrete attribute '%s'", var->name.c_str());
      addNoise(idx, *pi, makerind, table);
    }
  }

  return wtable;
}



/* Default constructor: an empty, newly allocated deviation map and a default deviation of 0. */
TPreprocessor_addGaussianNoise::TPreprocessor_addGaussianNoise()
  : deviations(mlnew TVariableFloatMap()), defaultDeviation(0.0)
{
}



/* Constructs the preprocessor from per-variable deviations and a default deviation. */
TPreprocessor_addGaussianNoise::TPreprocessor_addGaussianNoise(PVariableFloatMap devs, const float &defdev)
  : deviations(devs), defaultDeviation(defdev)
{
}


/* Ordering predicate for (index, value) pairs, used with sort() in this file:
   returns 1 when o1's index is smaller than o2's and 0 otherwise.
   The float parts are not compared. */
int cmp1st(const pair<int, float> &o1, const pair<int, float> &o2)
{
  const bool firstIsSmaller = o1.first < o2.first;
  return firstIsSmaller ? 1 : 0;
}

/* Gaussian noise is added through TGaussianNoiseGenerator instead of going
   attribute by attribute (as addNoise does); per the original author, this
   might require less paging on huge datasets.

   The function collects (position, deviation) pairs:
     - one for every entry of 'deviations' (the variable must be of type
       TValue::FLOATVAR, otherwise an error is raised);
     - when 'defaultDeviation' is nonzero, one for every FLOATVAR attribute
       that did not get its own entry above.
   The pairs are sorted by position and given to TGaussianNoiseGenerator,
   whose output is stored in a new example table.

   When there is no deviation map and the default is not positive, a plain
   copy of 'gen' is returned. newWeight receives weightID unchanged. */
PExampleGenerator TPreprocessor_addGaussianNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  if (!deviations && (defaultDeviation<=0.0))
    return mlnew TExampleTable(gen);

  const TDomain &domain = gen->domain.getReference();
  vector<pair<int, float> > noiseSpecs;
  vector<bool> hasOwnDeviation(domain.attributes->size(), false);

  if (deviations) {
    PITERATE(TVariableFloatMap, di, deviations) {
      PVariable var = (*di).first;
      if (var->varType != TValue::FLOATVAR)
        raiseError("attribute '%s' is not continuous", var->name.c_str());

      const int pos = domain.getVarNum(var);
      noiseSpecs.push_back(pair<int, float>(pos, (*di).second));

      // Only positions that address an ordinary attribute are marked.
      if ((pos >= 0) && (pos < hasOwnDeviation.size()))
        hasOwnDeviation[pos] = true;
    }
  }

  if (defaultDeviation) {
    TVarList::const_iterator vi(domain.attributes->begin());
    for (vector<bool>::size_type i = 0; i < hasOwnDeviation.size(); ++i, ++vi) {
      if (!hasOwnDeviation[i] && ((*vi)->varType == TValue::FLOATVAR))
        noiseSpecs.push_back(pair<int, float>(int(i), defaultDeviation));
    }
  }

  sort(noiseSpecs.begin(), noiseSpecs.end(), cmp1st);

  TGaussianNoiseGenerator noisyGen(noiseSpecs, gen, randomGenerator);
  return PExampleGenerator(mlnew TExampleTable(PExampleGenerator(noisyGen)));
}



/* Default constructor: an empty, newly allocated proportion map, a default
   proportion of 0, and 'specialType' set to valueDK. */
TPreprocessor_addMissing::TPreprocessor_addMissing()
  : proportions(mlnew TVariableFloatMap()), defaultProportion(0.0), specialType(valueDK)
{
}


/* Constructs the preprocessor from per-variable proportions, a default
   proportion, and the special value type to use. */
TPreprocessor_addMissing::TPreprocessor_addMissing(PVariableFloatMap probs, const float &defprob, const int &st)
  : proportions(probs), defaultProportion(defprob), specialType(st)
{
}


PExampleGenerator TPreprocessor_addMissing::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  newWeight = weightID;

  if (!proportions && (defaultProportion<=0.0))
    return mlnew TExampleTable(gen);

  const TDomain &domain = gen->domain.getReference();

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  // We mustn't allow MakeRandomIndices2 to initalize a new generator each time it's called since we'd than always select the same examples
  const int n = table->size();
  TMakeRandomIndices2 makerind;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -