⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 redundancy.cpp

📁 orange源码 数据挖掘技术
💻 CPP
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include <queue>
#include "stladdon.hpp"

#include "random.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "table.hpp"

#include "contingency.hpp"
#include "discretize.hpp"
#include "classfromvar.hpp"
#include "lookup.hpp"
#include "induce.hpp"
#include "measures.hpp"

#include "redundancy.ppp"


// Base constructor: stores the `akeepValues` flag in `keepValues`.
TRemoveRedundant::TRemoveRedundant(bool akeepValues)
  : keepValues(akeepValues)
{
}


// Forwards the `akeepValues` flag to the TRemoveRedundant base; no further state is set here.
TRemoveRedundantByInduction::TRemoveRedundantByInduction(bool akeepValues)
  : TRemoveRedundant(akeepValues)
{
}


/*  Pairs an attribute index with the value of a measure computed for it.

    The ordering is deliberately inverted: `a < b` holds when `a` has the
    HIGHER measure. A priority_queue<T_IntMeasure> (a max-heap with respect
    to operator <) therefore yields the element with the LOWEST measure
    at its top. */
class T_IntMeasure {
public:
  int attrNo;     // index of the attribute
  float measure;  // measure computed for that attribute

  T_IntMeasure(const int &attributeIndex, const float &score)
  : attrNo(attributeIndex),
    measure(score)
  {}

  // Inverted comparison; see the class comment.
  inline bool operator < (const T_IntMeasure &other) const
  { return other.measure < measure; }
};


/*  Removes redundant attributes by trying to reduce them, one at a time,
    with `featureReducer`.

    Attributes are tried in order of increasing `measure` (T_IntMeasure
    inverts its comparison, so the priority queue yields the lowest measure
    first). For each tried attribute the reducer is asked for a reduced
    variable built from that attribute alone:
      - if the reduced variable has a single value, the attribute is dropped;
      - otherwise, if `keepValues` is false and the reduced variable has fewer
        values than the attribute, the attribute is replaced by the reduced
        variable.

    gen         source of examples
    suspicious  if given and non-empty, only these attributes are considered
    NRGen       if non-NULL, receives the example table in the resulting
                domain (wrapped in a PExampleGenerator -- presumably the
                wrapper takes ownership of the table; confirm against the
                wrapper implementation)
    weightID    passed through to the measure / contingency computation

    Returns the resulting domain. Raises an error when the measure needs a
    domain contingency (not implemented). */
PDomain TRemoveRedundantByInduction::operator()(PExampleGenerator gen, PVarList suspicious, PExampleGenerator *NRGen, int weightID)
{ TExampleTable *newGen = NULL;
  try {
    if (measure->needs==TMeasureAttribute::Generator) {
      newGen=mlnew TExampleTable(gen);

      // Candidates for removal: the suspicious attributes found in the domain.
      // For a suspicious variable that is not in the domain, take instead the
      // attributes whose getValueFrom is a TClassifierFromVar reading it.
      // Without a suspicious list every attribute is a candidate.
      TVarList candidates;
      if (suspicious && suspicious->size()) {
        PITERATE(TVarList, si, suspicious)
          if (exists(newGen->domain->attributes.getReference(), *si))
            candidates.push_back(*si);
          else
            PITERATE(TVarList, ni, newGen->domain->attributes)
              if (   (*ni)->getValueFrom
                  && (*ni)->getValueFrom.is_derived_from(TClassifierFromVar)
                  && (*ni)->getValueFrom.AS(TClassifierFromVar)->whichVar==*si)
                candidates.push_back(*ni);
      }
      else 
        candidates = newGen->domain->attributes.getReference();

      // Each round re-measures the remaining candidates and tries them until
      // one is removed or replaced; a round without any change ends the loop.
      for(bool doMore = true; doMore; ) {
        priority_queue<T_IntMeasure> measurements;
        int ano=0;
        for(TVarList::iterator vi(newGen->domain->attributes->begin()), ve(newGen->domain->attributes->end());
            vi!=ve;
            vi++, ano++)
          if (find(candidates.begin(), candidates.end(), *vi)!=candidates.end()) {
            // `ano` indexes newGen's domain, so the measure has to be computed on
            // newGen: once an attribute has been removed, the same index in `gen`
            // refers to a different attribute.
            T_IntMeasure meas(ano, measure->operator ()(ano, PExampleGenerator(*newGen), PDistribution(), weightID));
            measurements.push(meas);
          }

        for(doMore = false; !doMore && !measurements.empty(); measurements.pop()) {
          PVariable attr = newGen->domain->attributes->at(measurements.top().attrNo);
          PVariable newVar;

          // A one-valued attribute is used as its own "reduced" variable,
          // which makes the test below drop it.
          if (attr->noOfValues()==1)
            newVar=attr;
          else {
            TVarList boundSet;
            boundSet.push_back(attr);
            float foo; // output argument of the reducer; its value is not used
            newVar = featureReducer->operator()(PExampleGenerator(*newGen), boundSet, attr->name+"_r", foo);
          }

          // Each candidate is tried at most once.
          candidates.erase(remove(candidates.begin(), candidates.end(), attr), candidates.end());

          if (   newVar
              && (   (newVar->noOfValues()==1)
                  || (!keepValues && (newVar->noOfValues() < attr->noOfValues())))) {
            PDomain newDomain = CLONE(TDomain, newGen->domain);
            newDomain->delVariable(attr);
            if (newVar->noOfValues()>1)
              newDomain->addVariable(newVar);
            TExampleTable *newNewGen = mlnew TExampleTable(newDomain, PExampleGenerator(*newGen));
            mldelete newGen;
            newGen = newNewGen;
            doMore = true;
          }
        }
      }

      PDomain retDomain = newGen->domain;
      if (NRGen)
        *NRGen = PExampleGenerator(newGen);
      else {
        mldelete newGen;
        newGen = NULL;
      }
      return retDomain;
    }

    else if (measure->needs==TMeasureAttribute::DomainContingency) {
      raiseError("redundancy removal by attribute measure that needs domain contingency is not implemented yet");
    }

    else {// measure->needs==Contingency_Class
      newGen = mlnew TExampleTable(gen);
      priority_queue<T_IntMeasure> measurements;

      // Measure every INTVAR attribute (restricted to the suspicious ones, if
      // given) once, from a contingency computed on the initial table.
      { TDomainContingency cont = TDomainContingency(PExampleGenerator(*newGen), weightID);
        int ano = 0;
        for(TDomainContingency::iterator ci(cont.begin()), ei(cont.end()); ci!=ei; ci++, ano++)
          if (   ((*ci)->outerVariable->varType==TValue::INTVAR)
              && (   !suspicious
                  || !suspicious->size()
                  || exists(suspicious.getReference(), gen->domain->attributes->at(ano))))
            measurements.push(T_IntMeasure(ano, measure->operator ()(*ci, cont.classes)));
      }

      // Indices stored in the queue refer to the original domain, hence the
      // attribute is looked up in gen->domain, not in the shrinking newGen.
      while(!measurements.empty()) {
        int attrNo = measurements.top().attrNo;
        PVariable attr = gen->domain->attributes->at(attrNo);
        TVarList boundSet;
        boundSet.push_back(attr);
        float foo; // output argument of the reducer; its value is not used
        PVariable newVar(featureReducer->operator()(PExampleGenerator(*newGen), boundSet, attr->name+"_r", foo));

        if (   newVar
            && (   (newVar->noOfValues()==1)
                || (!keepValues && (newVar->noOfValues() < attr->noOfValues())))) {
          PDomain newDomain = CLONE(TDomain, newGen->domain);
          newDomain->delVariable(attr);
          if (newVar->noOfValues()>1)
            newDomain->addVariable(newVar);
          TExampleTable *newNewGen = mlnew TExampleTable(newDomain, PExampleGenerator(*newGen));
          mldelete newGen;
          newGen = newNewGen;
        }
        measurements.pop();
      }

      PDomain retDomain = newGen->domain;
      if (NRGen) 
        *NRGen = PExampleGenerator(newGen);
      else
        mldelete newGen;
      return retDomain;
    }
  }
  catch (...) {
    // Release the working table whatever was thrown (not only std::exception),
    // then let the original exception continue to the caller.
    mldelete newGen;
    throw;
  }

  // Reached only if raiseError above returns instead of throwing.
  throw 0;
}



// Initializes the base with keepValues == false and stores the `aremeasure` flag in `remeasure`.
TRemoveRedundantByQuality::TRemoveRedundantByQuality(bool aremeasure)
  : TRemoveRedundant(false),
    remeasure(aremeasure)
{
}


/*  Removes the attributes with the lowest value of `measure`.

    Attributes are removed, lowest measure first, for as long as the lowest
    measure is <= `minQuality` or the domain has more than `removeBut`
    attributes.

    Without `remeasure` (or when the measure works on class contingencies)
    all attributes are measured once, up front. With `remeasure` and a
    generator-based measure, the remaining attributes are measured again
    after every removal.

    gen         source of examples
    suspicious  if given and non-empty, only these attributes are considered
    NRGen       if non-NULL, receives an example table in the resulting domain
    weightID    passed through to the measure / contingency computation

    Returns the resulting domain. */
PDomain TRemoveRedundantByQuality::operator()
  (PExampleGenerator gen, PVarList suspicious, PExampleGenerator *NRGen, int weightID)
{
  if (!remeasure || (measure->needs==TMeasureAttribute::Contingency_Class)) {
    // T_IntMeasure inverts its comparison, so top() is the lowest measure.
    priority_queue<T_IntMeasure> measurements;

    if (measure->needs==TMeasureAttribute::Generator) {
      int ano = 0;
      for(TVarList::iterator vi(gen->domain->attributes->begin()), ve(gen->domain->attributes->end());
          vi!=ve;
          vi++, ano++)
        if (   (   !suspicious
                || !suspicious->size()
                || (find(suspicious->begin(), suspicious->end(), *vi)!=suspicious->end()))) {
          T_IntMeasure meas(ano, measure->operator ()(ano, gen, PDistribution(), weightID));
          measurements.push(meas);
        }
    }
    else {
      // Contingency-based measure: only INTVAR attributes are measured.
      TDomainContingency cont(gen, weightID);
      int ano = 0;
      for(TDomainContingency::iterator ci(cont.begin()), ei(cont.end()); ci!=ei; ci++, ano++)
        if (   ((*ci)->outerVariable->varType==TValue::INTVAR)
            && (   !suspicious
                || !suspicious->size()
                || exists(suspicious.getReference(), gen->domain->attributes->at(ano)))) {
          T_IntMeasure meas(ano, measure->operator ()(*ci, cont.classes));
          measurements.push(meas);
        }
    }

    // Indices in the queue refer to gen->domain, which is left unmodified;
    // removals are applied to a clone.
    PDomain newDomain = CLONE(TDomain, gen->domain);
    while(   !measurements.empty()
          && (   (measurements.top().measure<=minQuality)
              || (int(newDomain->attributes->size())>removeBut))) {
      newDomain->delVariable(gen->domain->attributes->at(measurements.top().attrNo));
      measurements.pop();
    }

    if (NRGen)
      *NRGen = mlnew TExampleTable(newDomain, gen);

    return newDomain;
  }

  else if (measure->needs==TMeasureAttribute::DomainContingency) {
    raiseError("redundancy removal by attribute measure that needs domain contingency is not implemented yet");
  }

  else /* if (measure->needs==TMeasureAttribute::Generator) */ {
    TExampleTable *newGen = mlnew TExampleTable(gen);
    PDomain retDomain;
    try {

      // Fixed seed: tie-breaking below is reproducible between runs.
      TSimpleRandomGenerator srgen(0);

      float bestM = -1.0;
      do {
        // Find the (suspicious) attribute with the lowest measure in the
        // current table. `wins` counts how many attributes share the current
        // minimum; it is reset to 1 by the assignment inside the condition
        // whenever a first or strictly lower measure is found, and
        // incremented on a tie, where srgen.randbool(wins) decides whether
        // the tied attribute replaces the current choice (presumably with
        // probability 1/wins -- confirm in random.hpp).
        int bestAttr = -1, wins=0, attrNo=0;
        TVarList &attributes=newGen->domain->attributes.getReference();
        for(TVarList::iterator vi(attributes.begin()), ve(attributes.end()); vi!=ve; vi++, attrNo++)
          if (   !suspicious
              || !suspicious->size()
              || find(suspicious->begin(), suspicious->end(), *vi)!=suspicious->end()) {
            float thisM = measure->operator ()(attrNo, PExampleGenerator(*newGen), PDistribution(), weightID);
            if (   (!wins || (thisM <bestM)) && ((wins=1)==1)
                || (thisM==bestM) && srgen.randbool(++wins)) {
              bestAttr = attrNo; 
              bestM = thisM; 
            }
          }
        // Nothing left that may be removed.
        if (!wins)
          break;

        if ((bestM<=minQuality) || (int(attributes.size())>removeBut)) {
          PDomain newDomain = CLONE(TDomain, newGen->domain);
          newDomain->delVariable(attributes[bestAttr]);
          TExampleTable *newNewGen = mlnew TExampleTable(newDomain, PExampleGenerator(*newGen));
          // `attributes` refers into the old table and must not be used past this point.
          mldelete newGen;
          newGen = newNewGen;
        }
      } while((bestM<=minQuality) || (int(newGen->domain->attributes->size())>removeBut));

      retDomain = newGen->domain;

    }
    catch (...) {
      // Release the working table whatever was thrown (not only std::exception),
      // then let the original exception continue to the caller.
      mldelete newGen;
      throw;
    }

    if (NRGen)
      *NRGen = newGen;
    else
      mldelete newGen;

    return retDomain;
  }
  // Reached only if raiseError above returns instead of throwing.
  throw 0;
}



// Initializes the base with keepValues == false and stores the `anOnData` flag in `onData`.
TRemoveRedundantOneValue::TRemoveRedundantOneValue(bool anOnData)
  : TRemoveRedundant(false),
    onData(anOnData)
{
}

/*  Builds a domain without attributes that have fewer than two values.

    With `onData` set, an attribute is kept when its distribution computed
    from `gen` is not a TDiscDistribution, or when at least two of its values
    have a positive frequency. Otherwise the decision is based on the declared
    number of values; attributes that are not TEnumVariable are always kept.

    If `suspicious` is given and non-empty, attributes not listed in it are
    kept unconditionally. The class variable is taken over from gen->domain.
    If `nonRedundantResult` is non-NULL, it receives an example table in the
    new domain.

    Returns the new domain. */
PDomain TRemoveRedundantOneValue::operator()
  (PExampleGenerator gen, PVarList suspicious, PExampleGenerator *nonRedundantResult, int weightID)
{
  PDomain newDomain = mlnew TDomain;

  // True when the caller restricted the check to a non-empty list of attributes.
  const bool restricted = suspicious && suspicious->size();

  if (onData) {
    TDomainDistributions distr(gen, weightID);
    TDomainDistributions::iterator di(distr.begin());

    // The distributions are walked in step with the attributes.
    for(TVarList::iterator ai(gen->domain->attributes->begin()), ae(gen->domain->attributes->end());
        ai!=ae;
        ai++, di++) {
      bool keep;

      if (restricted && !exists(suspicious.getReference(), *ai))
        keep = true;
      else {
        const TDiscDistribution *discdist = (*di).AS(TDiscDistribution);
        if (!discdist)
          keep = true;
        else {
          // Count values with a positive frequency; two are enough to decide.
          int used = 0;
          const_ITERATE(TDiscDistribution, fi, *discdist) {
            if (*fi>0) {
              used++;
              if (used>1)
                break;
            }
          }
          keep = (used>1);
        }
      }

      if (keep)
        newDomain->addVariable(*ai);
    }
  }
  else {
    for(TVarList::iterator ai(gen->domain->attributes->begin()), ae(gen->domain->attributes->end());
        ai!=ae;
        ai++) {
      // suspicious given, this one is not among them
      if (restricted && !exists(suspicious.getReference(), *ai)) {
        newDomain->addVariable(*ai);
        continue;
      }

      // cannot find the number of values
      if (!(*ai).is_derived_from(TEnumVariable)) {
        newDomain->addVariable(*ai);
        continue;
      }

      // has enough values
      if ((*ai).AS(TEnumVariable)->noOfValues()>1)
        newDomain->addVariable(*ai);
    }
  }

  newDomain->setClass(gen->domain->classVar);

  if (nonRedundantResult)
    *nonRedundantResult=mlnew TExampleTable(newDomain, gen);

  return newDomain;
}



// Stores the `rov` flag in `removeOneValued`.
TRemoveUnusedValues::TRemoveUnusedValues(bool rov)
  : removeOneValued(rov)
{
}


/*  Returns a version of the discrete variable `var` restricted to the values
    that occur in `gen`.

    A value counts as used when its entry in the distribution of `var`
    (computed from `gen` with `weightID`) exceeds 1e-20.

    Result:
      - a null PVariable if no value is used, or if exactly one value is used
        and `removeOneValued` is set;
      - `var` itself if the number of used values equals the number of
        declared values;
      - otherwise a new TEnumVariable named "R_" + original name that holds
        only the used values and whose getValueFrom is a lookup classifier
        from `var` to the new variable.

    Raises an error if `var` is not a TEnumVariable. */
PVariable TRemoveUnusedValues::operator()(PVariable var, PExampleGenerator gen, const int &weightID)
{
  TEnumVariable *enumVar = var.AS(TEnumVariable);
  if (!enumVar)
    raiseError("'%s' is not a discrete attribute", var->name.c_str());

  TDiscDistribution dist(gen, var, weightID);

  // Count the values that actually occur.
  int usedCount = 0;
  { TDiscDistribution::const_iterator ci = dist.begin(), ce = dist.end();
    while (ci != ce) {
      if (*ci > 1e-20)
        usedCount++;
      ci++;
    }
  }

  if (usedCount == 0)
    return PVariable();
  if (removeOneValued && (usedCount == 1))
    return PVariable();

  // Every declared value is in use: nothing to strip.
  if (usedCount == int(enumVar->values->size()))
    return var;

  TEnumVariable *reduced = mlnew TEnumVariable("R_"+enumVar->name);
  reduced->values = PStringList(mlnew TStringList(usedCount, ""));
  PVariable newVar(reduced);

  TClassifierByLookupTable1 *lookup = mlnew TClassifierByLookupTable1(newVar, var);

  // Walk the distribution, the old value names and the two lookup lists in
  // parallel; each used value gets the next free index in the new variable.
  TStringList::iterator nameIt = enumVar->values->begin();
  TValueList::iterator tableIt(lookup->lookupTable->begin());
  TDistributionList::iterator distIt(lookup->distributions->begin());
  int newIndex = 0;
  { TDiscDistribution::const_iterator fi = dist.begin(), fe = dist.end();
    for( ; fi != fe; fi++, nameIt++, tableIt++, distIt++) {
      if (!(*fi > 1e-20))
        continue;

      reduced->values->at(newIndex) = *nameIt;
      *tableIt = TValue(newIndex);
      (*distIt)->addint(newIndex, 1.0);
      newIndex++;
    }
  }

  newVar->getValueFrom = lookup;

  return newVar;
}


/*  Builds a domain in which the discrete (INTVAR) attributes of gen->domain
    are replaced by versions without unused values, as computed by the
    single-variable operator() of this class.

    gen         source of examples and of the original domain
    weightID    passed through to the single-variable operator()
    checkClass  if set and the class variable is INTVAR, the class variable is
                processed in the same way
    checkMetas  if set, non-optional INTVAR meta attributes are processed too

    Attributes for which the single-variable operator() returns a null
    variable are left out. Returns gen->domain itself if nothing changed,
    otherwise a newly allocated domain. */
PDomain TRemoveUnusedValues::operator ()(PExampleGenerator gen, const int &weightID, bool checkClass, bool checkMetas)
{
  TVarList attributes;
  bool changed = false;
  PITERATE(TVarList, ai, gen->domain->attributes)
    if ((*ai)->varType != TValue::INTVAR)
      attributes.push_back(*ai);
    else {
      PVariable newattr = call(*ai, gen, weightID);
      if (newattr)
        attributes.push_back(newattr);
      if (newattr != *ai)
        changed = true;
    }

  
  PVariable &classVar = gen->domain->classVar;
  PVariable newClass;
  // classVar is tested for null first, so that a domain without a class
  // variable can be processed with checkClass set.
  if (checkClass && classVar && (classVar->varType == TValue::INTVAR)) {
    newClass = call(classVar, gen, weightID);
    if (newClass != classVar)
      changed = true;
  }
  else
    newClass = classVar;


  TMetaVector metas;
  ITERATE(TMetaVector, mi, gen->domain->metas) {
    if (!checkMetas || mi->optional || (mi->variable->varType != TValue::INTVAR))
      metas.push_back(*mi);
    else if (mi->variable->noOfValues() < 2)
      // A checked meta with fewer than two declared values is dropped.
      changed = true;
    else {
      PVariable newattr = call(mi->variable, gen, weightID);
      // NOTE(review): the processed meta variable is appended to the ordinary
      // attributes, not to `metas`, so a checked meta attribute ends up as a
      // regular attribute of the new domain -- confirm that this is intended.
      if (newattr)
        attributes.push_back(newattr);
      if (newattr != mi->variable)
        changed = true;
    }
  }

  if (!changed)
    return gen->domain;

  // mlnew, in line with every other allocation in this file.
  TDomain *newDomain = mlnew TDomain(newClass, attributes);
  newDomain->metas = metas;
  return newDomain;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -