⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 imputation.cpp

📁 orange源码 数据挖掘技术
💻 CPP
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "vars.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "classify.hpp"
#include "learn.hpp"
#include "basstat.hpp"
#include "table.hpp"
#include "lookup.hpp"
#include "classfromvar.hpp"

#include "imputation.ppp"

// NOTE(review): presumably forward-declares the wrapper (smart pointer) type
// for TClassifier, i.e. PClassifier, which is used further down in this file
// -- confirm against the definition of the WRAPPER macro.
WRAPPER(Classifier)

/* Replaces `val` in place with an indicator value:
   1 when the incoming value is special (val.isSpecial()), 0 otherwise.
   createImputedVar() below pairs this with the value names "def" (index 0)
   and "undef" (index 1). */
void TTransformValue_IsDefined::transform(TValue &val)
{
  const int indicator = val.isSpecial() ? 1 : 0;
  val = TValue(indicator);
}

/* Imputes every example produced by `gen` and collects the results in a new
   example table.

   - A null generator yields a null generator.
   - A generator with no examples yields an empty table over gen->domain.
   - Otherwise the first example is imputed once up front, only to find out
     which domain the imputed examples use; that probe is discarded and the
     table is then filled by imputing every example (the first one included).

   `weightID` is not used by this function. */
PExampleGenerator TImputer::operator()(PExampleGenerator gen, const int &weightID)
{
  if (!gen)
    return PExampleGenerator();

  if (!gen->numberOfExamples())
    return mlnew TExampleTable(gen->domain);

  // Probe: the domain of the result is whatever domain call() produces.
  TExample *probe = call(*gen->begin());
  TExampleTable *resultTable = mlnew TExampleTable(probe->domain);
  // Wrap the table right away so that the wrapper holds it from here on.
  PExampleGenerator wrappedResult = resultTable;
  mldelete probe;

  PEITERATE(ei, gen) {
    // NOTE(review): assumes addExample(TExample *) takes over the pointer
    // returned by call() -- confirm against TExampleTable.
    resultTable->addExample(call(*ei));
  }

  return wrappedResult;
}


/* Fills the special (unknown) values of `example` with the corresponding
   values from `defaults`, wherever the default itself is not special.

   example  - heap-allocated example to modify in place; must be from the same
              domain as `defaults`.
   defaults - example holding the replacement values.

   On ANY failure (domain mismatch included) `example` is deleted before the
   exception is propagated, so callers that just allocated it do not need
   their own cleanup. Previously the domain check was performed outside the
   try block, which leaked `example` when the domains differed. */
void TImputer::imputeDefaults(TExample *example, PExample defaults)
{
  try {
    if (example->domain != defaults->domain)
      raiseError("invalid domain");

    TExample::const_iterator ei(defaults->begin());
    TExample::iterator oi(example->begin()), oe(example->end());
    for(; oi!=oe; oi++, ei++)
      if ((*oi).isSpecial() && !(*ei).isSpecial())
        *oi = *ei;
  }
  catch (...) {
    // The example is released here on every error path; rethrow unchanged.
    mldelete example;
    throw;
  }
}

/* Builds the imputer with `defaults` set to a newly allocated example
   constructed over `domain`. */
TImputer_defaults::TImputer_defaults(PDomain domain)
  : defaults(mlnew TExample(domain))
{
}


/* Builds the imputer using the given (wrapped) example as `defaults`;
   the example is not copied. */
TImputer_defaults::TImputer_defaults(PExample example)
  : defaults(example)
{
}


/* Builds the imputer with `defaults` set to a newly allocated copy of `valu`. */
TImputer_defaults::TImputer_defaults(const TExample &valu)
  : defaults(mlnew TExample(valu))
{
}


/* Returns a new example: a clone of `example` whose special values have been
   replaced by the corresponding values of `defaults` (see imputeDefaults).
   The input example is not modified.
   checkProperty(defaults) is evaluated first (presumably it raises when the
   property is not set -- confirm against the macro). */
TExample *TImputer_defaults::operator()(TExample &example)
{
  checkProperty(defaults);

  TExample *result = CLONE(TExample, &example);
  imputeDefaults(result, defaults);

  return result;
}


/* Returns a new example constructed from `example` in this imputer's `domain`.
   If `defaults` is set, the special values that remain after the conversion
   are additionally filled from it (see imputeDefaults). */
TExample *TImputer_asValue::operator()(TExample &example)
{
  checkProperty(domain);

  TExample *converted = mlnew TExample(domain, example);
  if (!defaults)
    return converted;

  imputeDefaults(converted, defaults);
  return converted;
}


/* Returns a clone of `example` in which each special value is replaced by the
   prediction of the model stored at the same position in `models`.

   - `models` must contain exactly one entry per variable of the example's
     domain, otherwise "wrong domain (invalid size)" is raised.
   - Positions whose model is empty are left untouched.
   - A model that has a classVar must have exactly the variable at that
     position as its classVar; a model without a classVar must return a value
     whose varType matches the variable's. Either violation raises
     "wrong domain (wrong model for '<name>')".
   - Models are always applied to the original `example`, not to the partly
     imputed clone.

   The clone is deleted if anything throws while it is being filled. */
TExample *TImputer_model::operator ()(TExample &example)
{
  checkProperty(models);

  if (models->size() != example.domain->variables->size())
    raiseError("wrong domain (invalid size)");

  TExample *imputed = CLONE(TExample, &example);

  try {
    TExample::iterator valIt(imputed->begin()), valEnd(imputed->end());
    TClassifierList::iterator modelIt(models->begin()), modelEnd(models->end());
    TVarList::const_iterator varIt(example.domain->variables->begin());

    while ((valIt != valEnd) && (modelIt != modelEnd)) {
      // Only unknown values that have a model get imputed.
      if ((*valIt).isSpecial() && *modelIt) {
        if ((*modelIt)->classVar) {
          // The model declares what it predicts: it must be this variable.
          if ((*modelIt)->classVar != *varIt)
            raiseError("wrong domain (wrong model for '%s')", (*varIt)->name.c_str());
          *valIt = (*modelIt)->call(example);
        }
        else {
          // No declared class variable: at least the value type must fit.
          TValue predicted = (*modelIt)->call(example);
          if (predicted.varType != (*varIt)->varType)
            raiseError("wrong domain (wrong model for '%s')", (*varIt)->name.c_str());
          *valIt = predicted;
        }
      }

      valIt++;
      modelIt++;
      varIt++;
    }
  }
  catch (...) {
    mldelete imputed;
    throw;
  }

  return imputed;
}



/* Stores the three settings as given:
   ic   -> imputeClass   (whether the class value is imputed too),
   dete -> deterministic (whether the random generator is re-seeded from each
                          example, see operator()),
   dist -> distributions (optional per-variable distributions to draw from). */
TImputer_random::TImputer_random(const bool ic, const bool dete, PDistributionList dist)
  : imputeClass(ic), deterministic(dete), distributions(dist)
{
}

/* Returns a clone of `example` in which special values are replaced by random
   values.

   - If `imputeClass` is false and the domain has a class variable, the last
     variable is left untouched.
   - Without `distributions`, each value is drawn through the variable's own
     randomValue(); with `distributions`, it is drawn from the distribution at
     the same position (randomInt for INTVAR values, randomFloat otherwise).
   - In deterministic mode the random generator is re-seeded from
     imputed->sumValues() the first time a special value is met in this
     example.

   Changes compared to the previous version:
   - the clone is deleted if anything throws while it is being filled (it used
     to leak; TImputer_model::operator() already cleaned up this way);
   - the walk over `distributions` stops at the end of the list instead of
     running past it, and positions holding an empty distribution are skipped
     (the value stays special) instead of being dereferenced. */
TExample *TImputer_random::operator()(TExample &example)
{
  TExample *imputed = CLONE(TExample, &example);

  try {
    // In deterministic mode the generator still has to be seeded for this example.
    bool initialized = !deterministic;
    TVarList::iterator vi(imputed->domain->variables->begin()), ve(imputed->domain->variables->end());
    if (vi==ve)
      return imputed;
    if (!imputeClass && imputed->domain->classVar) {
      // Exclude the last variable (the class); nothing is left if it was the only one.
      if (vi == --ve)
        return imputed;
    }

    if (!distributions) {
      for(TExample::iterator ei(imputed->begin()); vi!=ve; vi++, ei++)
        if ((*ei).isSpecial()) {
          if (!initialized) {
            randgen.initseed = imputed->sumValues();
            randgen.reset();
            initialized = true;
          }
          *ei = (*vi)->randomValue(randgen.randint());
        }
    }

    else {
      TDistributionList::iterator di(distributions->begin()), de(distributions->end());

      for(TExample::iterator ei(imputed->begin()); (vi!=ve) && (di!=de); vi++, ei++, di++) {
        // Skip known values and positions for which no distribution is available.
        if ((*ei).isSpecial() && *di) {
          if (!initialized) {
            randgen.initseed = imputed->sumValues();
            randgen.reset();
            initialized = true;
          }

          if ((*ei).varType == TValue::INTVAR)
            *ei = TValue((*di)->randomInt(randgen.randlong()));
          else
            *ei = TValue((*di)->randomFloat(randgen.randlong()));
        }
      }
    }
  }
  catch (...) {
    mldelete imputed;
    throw;
  }

  return imputed;
}





/* imputeClass defaults to true. */
TImputerConstructor::TImputerConstructor()
  : imputeClass(true)
{
}


/* Returns a TImputer_defaults built from this constructor's `defaults`.
   Neither `egen` nor `weightID` is consulted. */
PImputer TImputerConstructor_defaults::operator()(PExampleGenerator egen, const int &weightID)
{
  PImputer imputer = mlnew TImputer_defaults(defaults);
  return imputer;
}


/* Builds a TImputer_defaults whose default for each variable is taken from
   that variable's distribution in `egen` (weighted by `weightID`):
     - distribution supports discrete values   -> highestProbValue(...)
     - distribution supports continuous values -> percentile(50)
     - otherwise                               -> the variable's DK() value.
   If `imputeClass` is false and the domain has a class, the class default is
   reset to DK() afterwards. */
PImputer TImputerConstructor_average::operator()(PExampleGenerator egen, const int &weightID)
{
  TImputer_defaults *imputer = mlnew TImputer_defaults(egen->domain);
  PImputer wimputer(imputer);  // wrapper holds the imputer from here on

  TDomainDistributions ddist(egen, weightID);

  // Three sequences walked in lock-step: default values, distributions, variables.
  TExample::iterator valIt(imputer->defaults->begin()), valEnd(imputer->defaults->end());
  TDomainDistributions::const_iterator distIt(ddist.begin());
  TVarList::const_iterator varIt(egen->domain->variables->begin());

  while (valIt != valEnd) {
    if ((*distIt)->supportsDiscrete) {
      *valIt = (*distIt)->highestProbValue(egen->numberOfExamples());
    }
    else if ((*distIt)->supportsContinuous) {
      *valIt = TValue((*distIt)->percentile(50));
    }
    else {
      *valIt = TValue((*varIt)->DK());
    }

    valIt++;
    distIt++;
    varIt++;
  }

  const bool keepClassUnknown = !imputeClass && egen->domain->classVar;
  if (keepClassUnknown)
    imputer->defaults->setClass(egen->domain->classVar->DK());

  return wimputer;
}


/* Builds a TImputer_defaults whose default for each variable is the `min`
   field of its basic statistics computed on `egen` (weighted by `weightID`).
   Variables without a statistics entry get TValue(0).
   If `imputeClass` is false and the domain has a class, the class default is
   reset to DK() afterwards. */
PImputer TImputerConstructor_minimal::operator()(PExampleGenerator egen, const int &weightID)
{
  TImputer_defaults *imputer = mlnew TImputer_defaults(egen->domain);
  PImputer wimputer(imputer);  // wrapper holds the imputer from here on

  TDomainBasicAttrStat basstat(egen, weightID);

  TExample::iterator valIt(imputer->defaults->begin()), valEnd(imputer->defaults->end());
  TDomainBasicAttrStat::const_iterator statIt(basstat.begin());

  while (valIt != valEnd) {
    if (*statIt) {
      *valIt = TValue((*statIt)->min);
    }
    else {
      // No statistics for this variable: use value 0.
      *valIt = TValue(0);
    }

    valIt++;
    statIt++;
  }

  const bool keepClassUnknown = !imputeClass && egen->domain->classVar;
  if (keepClassUnknown)
    imputer->defaults->setClass(egen->domain->classVar->DK());

  return wimputer;
}


/* Builds a TImputer_defaults whose default for each variable is the `max`
   field of its basic statistics computed on `egen` (weighted by `weightID`).
   Variables without a statistics entry get the value with index
   noOfValues()-1.
   If `imputeClass` is false and the domain has a class, the class default is
   reset to DK() afterwards. */
PImputer TImputerConstructor_maximal::operator()(PExampleGenerator egen, const int &weightID)
{
  TImputer_defaults *imputer = mlnew TImputer_defaults(egen->domain);
  PImputer wimputer(imputer);  // wrapper holds the imputer from here on

  TDomainBasicAttrStat basstat(egen, weightID);

  // Three sequences walked in lock-step: default values, statistics, variables.
  TExample::iterator valIt(imputer->defaults->begin()), valEnd(imputer->defaults->end());
  TDomainBasicAttrStat::const_iterator statIt(basstat.begin());
  TVarList::const_iterator varIt(egen->domain->variables->begin());

  while (valIt != valEnd) {
    if (*statIt) {
      *valIt = TValue((*statIt)->max);
    }
    else {
      // No statistics for this variable: use its last value index.
      *valIt = TValue((*varIt)->noOfValues()-1);
    }

    valIt++;
    statIt++;
    varIt++;
  }

  const bool keepClassUnknown = !imputeClass && egen->domain->classVar;
  if (keepClassUnknown)
    imputer->defaults->setClass(egen->domain->classVar->DK());

  return wimputer;
}


// Single file-level transformer instance; createImputedVar() below installs it
// as the `transformer` of the classifier attached to each "<name>_def"
// indicator variable.
TTransformValue_IsDefined staticTransform_IsDefined;

/* Creates the variable that represents "unknown" as an ordinary value for
   `var`.

   - INTVAR: a new enum variable with the same name and the same values plus
     an extra value "NA". Its getValueFrom is a one-variable lookup table over
     `var` in which every entry i is set to value i.
   - FLOATVAR: a new two-valued enum variable "<name>_def" with values "def"
     and "undef", computed from `var` through a TClassifierFromVar that also
     transforms unknowns and uses staticTransform_IsDefined.
   - any other type: an empty PVariable. */
PVariable TImputerConstructor_asValue::createImputedVar(PVariable var)
{
  const bool isDiscrete = (var->varType == TValue::INTVAR);
  if (isDiscrete) {
    TEnumVariable *enumVar = mlnew TEnumVariable(var->name);
    PVariable wrapped = enumVar;  // wrapper holds the new variable from here on

    // Copy the original value names and append the one standing for "unknown".
    enumVar->values = mlnew TStringList(var.AS(TEnumVariable)->values.getReference());
    enumVar->values->push_back("NA");

    TClassifierByLookupTable1 *lookup = mlnew TClassifierByLookupTable1(enumVar, var);
    enumVar->getValueFrom = lookup;

    // Identity mapping: table entry idx yields value idx.
    TValueList &table = lookup->lookupTable.getReference();
    const int tableSize = table.size();
    for (int idx = 0; idx != tableSize; idx++)
      table[idx] = TValue(idx);

    return wrapped;
  }

  const bool isContinuous = (var->varType == TValue::FLOATVAR);
  if (isContinuous) {
    TEnumVariable *indicator = mlnew TEnumVariable(var->name + "_def");
    PVariable wrapped = indicator;  // wrapper holds the new variable from here on

    indicator->values->push_back("def");
    indicator->values->push_back("undef");

    TClassifierFromVar *fromVar = mlnew TClassifierFromVar(indicator, var);
    indicator->getValueFrom = fromVar;
    fromVar->transformUnknowns = true;
    fromVar->transformer = PTransformValue(staticTransform_IsDefined);

    return wrapped;
  }

  // Neither discrete nor continuous: no imputed counterpart.
  return PVariable();
}


/* Builds a TImputer_asValue for the domain of `egen`.

   New domain layout:
   - each discrete attribute is replaced by its imputed variant (see
     createImputedVar: same values plus "NA");
   - each continuous attribute is preceded by its "<name>_def" indicator and
     then kept as is;
   - attributes for which createImputedVar returns nothing are kept as is;
   - the class is replaced by its imputed variant when `imputeClass` is set
     and such a variant exists, otherwise the original class is kept.

   If there is at least one continuous attribute, `defaults` is created and
   the slot of each continuous attribute is set to the `avg` field of its
   basic statistics (weighted by `weightID`).

   Raises "This method cannot impute continuous classes" when `imputeClass`
   is set and the class is continuous.

   Changes compared to the previous version:
   - the result of createImputedVar(domain->classVar) used to be discarded, so
     the class was never actually replaced; it is now assigned to classVar;
   - the defaults loop is bounded by the number of attributes. It used to walk
     over every statistics entry, including the one for the class, and for a
     continuous class (possible when `imputeClass` is false) wrote one
     element past the end of the defaults example. */
PImputer TImputerConstructor_asValue::operator ()(PExampleGenerator egen, const int &weightID)
{
  PDomain &domain = egen->domain;
  if (imputeClass && domain->classVar && domain->classVar->varType == TValue::FLOATVAR)
    raiseError("This method cannot impute continuous classes");

  bool hasContinuous = false;
  TVarList newVariables;
  PITERATE(TVarList, vi, domain->attributes) {
    PVariable newvar = createImputedVar(*vi);
    if (newvar) {
      newVariables.push_back(newvar);
      if ((*vi)->varType == TValue::FLOATVAR) {
        // Continuous attribute: indicator first, then the original attribute.
        newVariables.push_back(*vi);
        hasContinuous = true;
      }
    }
    else
      newVariables.push_back(*vi);
  }

  PVariable classVar;
  if (domain->classVar) {
    if (imputeClass)
      classVar = createImputedVar(domain->classVar);
    // Not imputing the class, or no imputed variant available: keep the original.
    if (!classVar)
      classVar = domain->classVar;
  }

  TImputer_asValue *imputer = mlnew TImputer_asValue;
  PImputer wimputer(imputer);
  imputer->domain = mlnew TDomain(classVar, newVariables);

  if (hasContinuous) {
    imputer->defaults = mlnew TExample(imputer->domain);
    TDomainBasicAttrStat basstat(egen, weightID);
    TExample::iterator aei(imputer->defaults->begin());
    TDomainBasicAttrStat::const_iterator bi(basstat.begin()), be(basstat.end());
    // `ai` only limits the walk to the attributes; the class slot is never touched.
    TVarList::const_iterator ai(domain->attributes->begin()), ae(domain->attributes->end());
    for(; (ai != ae) && (bi != be); ai++, bi++) {
      // Skip the first slot of this attribute (the imputed discrete variable,
      // the "_def" indicator, or the unchanged attribute).
      aei++;
      // Statistics present: this is a continuous attribute, whose own slot
      // follows the indicator and receives the average.
      if (*bi)
        *(aei++) = TValue((*bi)->avg);
    }
  }

  return wimputer;
}

/* useClass defaults to false. */
TImputerConstructor_model::TImputerConstructor_model()
  : useClass(false)
{
}


/* Builds a TImputer_model holding one entry per variable of egen's domain.

   For every attribute a model is learned that predicts that attribute from
   the remaining ones (plus the class, when `useClass` is set):
   `learnerDiscrete` is used for INTVAR attributes, `learnerContinuous` for
   FLOATVAR attributes. Attributes for which no suitable learner is set get an
   empty entry.

   If the domain has a class, one final entry is appended for it: a model
   learned on `egen` itself when `imputeClass` is set and a suitable learner
   exists, an empty entry otherwise.

   Change compared to the previous version: with `useClass` set, the attribute
   loop used to run over the class variable as well and the block that follows
   the loop then appended a second entry for the class. The resulting list had
   one element more than the domain has variables, which
   TImputer_model::operator() rejects with "wrong domain (invalid size)".
   The loop now stops before the class. */
PImputer TImputerConstructor_model::operator()(PExampleGenerator egen, const int &weightID)
{
  TImputer_model *imputer = mlnew TImputer_model;
  PImputer wimputer(imputer);
  imputer->models = mlnew TClassifierList;

  // Working copy of the variable list; the class is dropped from it unless it
  // is to be used as a predictor.
  TVarList vl = egen->domain->variables.getReference();
  if (!useClass && egen->domain->classVar)
    vl.erase(vl.end()-1);
  PVariable tmp;

  // When the class is kept in `vl` it is the last element; it serves only as
  // a predictor here and gets its own entry after the loop.
  TVarList::iterator vli(vl.begin()), vle(vl.end());
  if (useClass && egen->domain->classVar)
    --vle;

  for(; vli != vle; vli++) {
    const int varType = (*vli)->varType;
    if (   ((varType == TValue::INTVAR) && learnerDiscrete)
        || ((varType == TValue::FLOATVAR) && learnerContinuous)) {
      // Move the current variable to the last position so that it becomes
      // the class of the temporary domain ...
      tmp = *vli; *vli = vl.back(); vl.back() = tmp;
      PDomain newdomain = mlnew TDomain(vl);
      PExampleGenerator newgen = mlnew TExampleTable(newdomain, egen);
      imputer->models->push_back((varType == TValue::INTVAR ? learnerDiscrete : learnerContinuous)->call(newgen, weightID));
      // ... and swap it back to restore the original order.
      tmp = *vli; *vli = vl.back(); vl.back() = tmp;
    }
    else
      imputer->models->push_back(PClassifier());
  }

  if (egen->domain->classVar) {
    const int varType = egen->domain->classVar->varType;
    if (imputeClass &&
       (   ((varType == TValue::INTVAR) && learnerDiscrete)
        || ((varType == TValue::FLOATVAR) && learnerContinuous)))
      imputer->models->push_back((varType == TValue::INTVAR ? learnerDiscrete : learnerContinuous)->call(egen, weightID));
    else
      imputer->models->push_back(PClassifier());
  }

  return wimputer;
}



/* Stores `dete` as the `deterministic` setting that is handed on to the
   TImputer_random objects this constructor creates. */
TImputerConstructor_random::TImputerConstructor_random(const bool dete)
  : deterministic(dete)
{
}


/* Builds a TImputer_random with one distribution per variable of egen's
   domain:
   - INTVAR variables get their distribution from TDomainDistributions;
   - FLOATVAR variables get a Gaussian distribution built from the `avg` and
     `dev` fields of their basic statistics;
   - any other variable (or one for which the needed statistics are missing)
     gets an empty entry, which keeps the list aligned with the variables.

   Statistics are only computed when the domain reports having continuous,
   respectively discrete, attributes.

   Changes compared to the previous version:
   - every non-INTVAR variable used to be treated as continuous and `*dbi`
     was dereferenced unconditionally, even when the statistics had not been
     computed (iterator never initialised) or held no entry for the variable;
     both cases are now checked;
   - objects are allocated with `mlnew`, as everywhere else in this file,
     instead of raw `new`. */
PImputer TImputerConstructor_random::operator()(PExampleGenerator egen, const int &weightID)
{
  PDomainBasicAttrStat dbas;
  TDomainBasicAttrStat::const_iterator dbi;
  if (egen->domain->hasContinuousAttributes(true)) {
    dbas = mlnew TDomainBasicAttrStat(egen, weightID);
    dbi = dbas->begin();
  }

  PDomainDistributions ddist;
  TDomainDistributions::const_iterator ddi;
  if (egen->domain->hasDiscreteAttributes(true)) {
    ddist = mlnew TDomainDistributions(egen, weightID, false, true);
    ddi = ddist->begin();
  }

  PDistributionList distributions = mlnew TDistributionList();

  PITERATE(TVarList, vi, egen->domain->variables) {
    const int varType = (*vi)->varType;
    // `dbi` / `ddi` are valid only if the corresponding statistics exist.
    if ((varType == TValue::INTVAR) && ddist)
      distributions->push_back(*ddi);
    else if ((varType == TValue::FLOATVAR) && dbas && *dbi)
      distributions->push_back(mlnew TGaussianDistribution((*dbi)->avg, (*dbi)->dev));
    else
      distributions->push_back(PDistribution());

    // Both statistics containers are advanced once per variable.
    if (dbas)
      dbi++;
    if (ddist)
      ddi++;
  }

  return mlnew TImputer_random(imputeClass, deterministic, distributions);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -