⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 preprocessors.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
  makerind.randomGenerator = randomGenerator ? randomGenerator : mlnew TRandomGenerator;;

  // this will not assign the defaultProportion to the class
  vector<float> props(domain.attributes->size(), defaultProportion > 0.0 ? defaultProportion : 0.0);
  getProportions(proportions, domain, props);

  int idx = 0;
  vector<float>::const_iterator pi(props.begin()), pe(props.end());
  for(; pi != pe; idx++, pi++)
    if (*pi > 0.0) {
      PLongList rind = makerind(n, 1 - *pi);
      const unsigned char &varType = domain.variables->at(idx)->varType;
      int eind = 0;
      PITERATE(TLongList, ri, rind) {
        if (*ri)
          (*table)[eind][idx] = TValue(varType, specialType);
        eind++;
      }
    }

  return wtable;
}



// Constructs the preprocessor; 'dev' is stored in 'deviation', which
// operator() later hands to the Gaussian noise generator.
TPreprocessor_addGaussianClassNoise::TPreprocessor_addGaussianClassNoise(const float &dev)
  : deviation(dev)
{
}


// Returns a new example table built from 'gen'. When 'deviation' is positive,
// the examples are first passed through a TGaussianNoiseGenerator configured
// for the value at index attributes->size() (the slot following the ordinary
// attributes); otherwise the examples are copied as they are.
//
// Raises an error if the domain has no class or the class is not continuous.
// 'newWeight' is set to 'weightID'.
PExampleGenerator TPreprocessor_addGaussianClassNoise::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  PVariable classVar = gen->domain->classVar;

  if (!classVar)
    raiseError("Class-less domain");
  if (classVar->varType != TValue::FLOATVAR)
    raiseError("Class '%s' is not continuous", gen->domain->classVar->name.c_str());

  newWeight = weightID;

  // Nothing to perturb: hand back a plain copy.
  if (!(deviation > 0.0))
    return mlnew TExampleTable(gen);

  const int noisyIndex = gen->domain->attributes->size();
  vector<pair<int, float> > deviations(1, pair<int, float>(noisyIndex, deviation));

  TGaussianNoiseGenerator gngen(deviations, gen, randomGenerator);
  return PExampleGenerator(mlnew TExampleTable(PExampleGenerator(gngen)));
}


// Constructs the preprocessor: 'cm' is stored as 'proportion' and 'st' as
// 'specialType' (the special-value code written into the class by operator()).
TPreprocessor_addMissingClasses::TPreprocessor_addMissingClasses(const float &cm, const int &st)
  : proportion(cm), specialType(st)
{
}
  
  
// Copies the examples from 'gen' into a new table and, when 'proportion' is
// positive, replaces the class of a random selection of examples with a
// special value of type 'specialType'.
//
// The selection comes from TMakeRandomIndices2 called with (size, 1-proportion);
// examples whose index entry is non-zero get their class replaced.
// Raises an error for class-less domains. 'newWeight' is set to 'weightID'.
PExampleGenerator TPreprocessor_addMissingClasses::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar)
    raiseError("Class-less domain");

  TExampleTable *examples = mlnew TExampleTable(gen);
  PExampleGenerator result = examples;

  if (proportion > 0.0) {
    TMakeRandomIndices2 indexer;
    indexer.randomGenerator = randomGenerator;
    PLongList marks(indexer(examples->size(), 1 - proportion));

    const TVariable &classDescriptor = examples->domain->classVar.getReference();
    const int &classType = classDescriptor.varType;

    // 'marks' runs parallel to the examples; 'position' tracks the example index
    int position = 0;
    PITERATE(TLongList, mark, marks) {
      if (*mark)
        (*examples)[position].setClass(TValue(classType, specialType));
      position++;
    }
  }

  newWeight = weightID;
  return result;
}



// Default construction: an empty (but non-null) list of class weights and
// equalization switched off.
TPreprocessor_addClassWeight::TPreprocessor_addClassWeight()
  : classWeights(mlnew TFloatList), equalize(false)
{
}


// Constructs the preprocessor from an explicit list of class weights ('cw')
// and the equalization flag ('eq'). Initializers are listed in the same order
// as in the default constructor.
TPreprocessor_addClassWeight::TPreprocessor_addClassWeight(PFloatList cw, const bool &eq)
  : classWeights(cw), equalize(eq)
{
}


// Copies the examples from 'gen' into a new table and stores a per-example
// weight, derived from the example's class, under a freshly allocated meta id.
//
// Three modes, depending on the properties:
//  - 'classWeights' only: the weight of class i is classWeights[i];
//  - 'equalize' only: the weight of a non-empty class is N / k / count(class),
//    where N is the distribution's 'abs' and k the number of non-empty classes;
//  - both: the weight is classWeights[i] / count(class) * (tot_w * N), where
//    tot_w sums the class weights of non-empty classes.
// Classes without examples get weight 1. The stored weight is the example's
// original weight (WEIGHT) multiplied by the weight of its class.
//
// 'newWeight' receives the new meta id, or 0 if there is nothing to do
// (neither equalization nor class weights requested, the class has no values,
// or all non-empty classes have zero class weight).
//
// Raises an error if the domain has no discrete class, if the size of
// 'classWeights' differs from the number of class values, or if the data
// contains out-of-range classes.
PExampleGenerator TPreprocessor_addClassWeight::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  if (!gen->domain->classVar || (gen->domain->classVar->varType != TValue::INTVAR))
    raiseError("Class-less domain or non-discrete class");

  TExampleTable *table = mlnew TExampleTable(gen);
  PExampleGenerator wtable = table;

  const int nocl = gen->domain->classVar->noOfValues();

  // 'classWeights' may be a null pointer (the checks further down allow for it),
  // so test it before dereferencing.
  const bool haveClassWeights = classWeights && classWeights->size();

  if ((!equalize && !haveClassWeights) || !nocl) {
    newWeight = 0;
    return wtable;
  }

  if (haveClassWeights && (classWeights->size() != nocl))
    raiseError("size of classWeights should equal the number of classes");


  vector<float> weights;

  if (equalize) {
    PDistribution dist(getClassDistribution(gen, weightID));
    const TDiscDistribution &ddist = CAST_TO_DISCDISTRIBUTION(dist);
    if (ddist.size() > nocl)
      raiseError("there are out-of-range classes in the data (attribute descriptor has too few values)");

    if (haveClassWeights) {
      float tot_w = 0.0;
      TFloatList::const_iterator cwi(classWeights->begin());
      TDiscDistribution::const_iterator di(ddist.begin()), de(ddist.end());
      for(; di!=de; di++, cwi++)
        if (*di > 0.0)
          tot_w += *cwi;

      if (tot_w == 0.0) {
        newWeight = 0;
        return wtable;
      }

      // NOTE(review): the equalization-only branch below scales by N / (number of
      // non-empty classes); by analogy one might expect N / tot_w here instead of
      // tot_w * N. Left as it was -- please confirm the intended normalization.
      float fact = tot_w * ddist.abs;
      di = ddist.begin();
      PITERATE(TFloatList, wi, classWeights) {
        // The distribution can be shorter than the list of class weights, and a
        // class can be empty; such classes get a neutral weight instead of
        // reading past the end or dividing by zero.
        if ((di != de) && (*di > 0.0))
          weights.push_back(*wi / *di * fact);
        else
          weights.push_back(1.0);
        if (di != de)
          di++;
      }
    }

    else { // no class weights, only equalization
      int noNullClasses = 0;
      { const_ITERATE(TDiscDistribution, di, ddist)
          if (*di>0.0)
            noNullClasses++;
      }
      const float N = ddist.abs;
      const_ITERATE(TDiscDistribution, di, ddist)
        if (*di>0.0)
          weights.push_back(N / noNullClasses / *di);
        else
          weights.push_back(1.0);
    }
  }

  else  // no equalization, only weights
    weights = classWeights.getReference();

  newWeight = getMetaID();
  PEITERATE(ei, table) {
    float w = WEIGHT(*ei);
    const TValue &cls = (*ei).getClass();
    // Examples with an unknown or out-of-range class keep their original weight;
    // indexing 'weights' with such a value would be out of bounds.
    if (!cls.isSpecial() && (cls.intV >= 0) && (cls.intV < static_cast<int>(weights.size())))
      w *= weights[cls.intV];
    (*ei).setMeta(newWeight, TValue(w));
  }

  return wtable;
}



// Forward declarations of the two estimators called by
// TPreprocessor_addCensorWeight::operator() for the 'km' and 'bayes' methods.
// Their definitions are not in this part of the file. Both take the example
// generator, the index of the outcome variable, the value denoting failure,
// the index of the time variable and the weight meta id; bayesSurvival
// additionally takes the maximal time.
PDistribution kaplanMeier(PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID);
PDistribution bayesSurvival(PExampleGenerator gen, const int &outcomeIndex, TValue &failValue, const int &timeIndex, const int &weightID, const float &maxTime);

// Default construction: no outcome or time variable, no event value, method
// 'km', no maximal time (0.0) and no complementary examples.
TPreprocessor_addCensorWeight::TPreprocessor_addCensorWeight()
  : outcomeVar(),
    timeVar(),
    eventValue(),
    method(km),
    maxTime(0.0),
    addComplementary(false)
{
}


// Constructs the preprocessor from explicit settings:
//   ov - outcome variable, tv - time variable, ev - value denoting the event,
//   me - weighting method, mt - maximal time.
// 'addComplementary' starts as false.
//
// 'maxTime' is initialised from 'mt'; previously the argument was ignored and
// 'maxTime' was always set to 0.0.
TPreprocessor_addCensorWeight::TPreprocessor_addCensorWeight(PVariable ov, PVariable tv, const TValue &ev, const int &me, const float &mt)
: outcomeVar(ov),
  timeVar(tv),
  eventValue(ev),
  method(me),
  maxTime(mt),
  addComplementary(false)
{}

// Appends a copy of 'example' to 'table' with meta value 'weightID' set to
// 'weight'. If 'complementary' is non-negative and 'compWeight' is positive,
// a second copy is appended whose class is set to 'complementary' and whose
// weight is 'compWeight'.
void TPreprocessor_addCensorWeight::addExample(TExampleTable *table, const int &weightID, const TExample &example, const float &weight, const int &complementary, const float &compWeight)
{
  TExample row(example);

  row.setMeta(weightID, TValue(weight));
  table->addExample(row);

  const bool wantComplement = (complementary >= 0) && (compWeight > 0.0);
  if (!wantComplement)
    return;

  // reuse the same copy: only the class and the weight differ
  row.setClass(TValue(complementary));
  row.setMeta(weightID, TValue(compWeight));
  table->addExample(row);
}


// Builds a new table (same domain as 'gen') in which every example carries a
// weight, stored under a freshly allocated meta id returned in 'newWeight'.
//
// Examples whose outcome equals 'eventValue' always keep their original weight.
// For the remaining examples the weight depends on 'method':
//  - linear: original weight * min(1, time / maxTime); if 'maxTime' is not
//    positive, the largest time found in the data is used instead;
//  - km / bayes: a distribution is obtained from kaplanMeier() or
//    bayesSurvival(); examples observed beyond 'maxTime' keep their weight,
//    the others get weight * KM_max/KM(t) (km) or weight * KM(t) (bayes).
//    If 'addComplementary' is set, a complementary example with the event
//    class and the remaining weight is added as well (see addExample).
// Examples with an unknown time are not copied, and neither are, for 'km',
// examples for which the distribution gives a non-positive value.
//
// Raises an error if 'eventValue' is unset or not discrete, if the outcome or
// time variable cannot be found, if a time value is not continuous, if no
// positive maximal time can be determined for the linear method, if 'maxTime'
// is not set for the Bayes method, or if 'method' is unknown.
PExampleGenerator TPreprocessor_addCensorWeight::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{ 
  if (eventValue.isSpecial())
    raiseError("'eventValue' not set");

  if (eventValue.varType != TValue::INTVAR)
    raiseError("'eventValue' invalid (discrete value expected)");

  const int failIndex = eventValue.intV;

  // Initialised so that it never holds an indeterminate value on the error paths.
  int outcomeIndex = ILLEGAL_INT;
  if (outcomeVar) {
    outcomeIndex = gen->domain->getVarNum(outcomeVar, false);
    if (outcomeIndex==ILLEGAL_INT)
      raiseError("outcomeVar not found in domain");
  }
  else
    if (gen->domain->classVar)
      outcomeIndex = gen->domain->attributes->size();
    else
      raiseError("'outcomeVar' not set and the domain is class-less");

  int complementary = addComplementary ? eventValue.intV : -1;

  checkProperty(timeVar);
  int timeIndex = gen->domain->getVarNum(timeVar, false);
  if (timeIndex==ILLEGAL_INT)
    raiseError("'timeVar' not found in domain");

  TExampleTable *table = mlnew TExampleTable(gen->domain);
  PExampleGenerator wtable = table;

  if (method == linear) {
    float thisMaxTime = maxTime;
    if (thisMaxTime<=0.0)
      // Scan the input examples. The output table is still empty at this point,
      // so scanning it (as the code used to) could never find a maximum.
      PEITERATE(ei, gen) {
        const TValue &tme = (*ei)[timeIndex];
        if (!tme.isSpecial()) {
          if (tme.varType != TValue::FLOATVAR)
            raiseError("invalid time (continuous attribute expected)");
          else
            if (tme.floatV>thisMaxTime)
              thisMaxTime = tme.floatV;
        }
      }

    if (thisMaxTime<=0.0)
      raiseError("invalid time values (max<=0)");

    newWeight = getMetaID();
    PEITERATE(ei, gen) {
      if (!(*ei)[outcomeIndex].isSpecial() && (*ei)[outcomeIndex].intV==failIndex)
        addExample(table, newWeight, *ei, WEIGHT(*ei), complementary);
      else {
        const TValue &tme = (*ei)[timeIndex];
        // need to check it again -- the above check is only run if maxTime is not given
        if (tme.varType != TValue::FLOATVAR)
          raiseError("invalid time (continuous attribute expected)");

        if (!tme.isSpecial())
          addExample(table, newWeight, *ei, WEIGHT(*ei) * (tme.floatV>thisMaxTime ? 1.0 : tme.floatV / thisMaxTime), complementary);
      }
    }
  }

  else if ((method == km) || (method == bayes)) {
    // The check has to look at the chosen method; comparing the two constants
    // 'km' and 'bayes' with each other is always false.
    if ((method==bayes) && (maxTime<=0.0))
      raiseError("'maxTime' should be set when 'method' is 'Bayes'");
      
    PDistribution KM = (method == km) ? kaplanMeier(gen, outcomeIndex, eventValue, timeIndex, weightID)
                                      : bayesSurvival(gen, outcomeIndex, eventValue, timeIndex, weightID, maxTime);

    const bool haveMaxTime = maxTime>0.0;

    // Without a maximal time, the last point of the distribution is the reference.
    float KM_max = haveMaxTime ? KM->p(maxTime) : (*KM.AS(TContDistribution)->distribution.rbegin()).second;

    newWeight = getMetaID();
    PEITERATE(ei, gen) {
      if (!(*ei)[outcomeIndex].isSpecial() && (*ei)[outcomeIndex].intV==failIndex)
        addExample(table, newWeight, *ei, WEIGHT(*ei), -1);
      else {
        const TValue &tme = (*ei)[timeIndex];
        if (tme.varType != TValue::FLOATVAR)
          raiseError("invalid time (continuous attribute expected)");
        if (!tme.isSpecial()) {
          // The cut-off only applies when a maximal time is set; with maxTime<=0
          // every positive time would otherwise count as "beyond maxTime" and no
          // example would ever be reweighted.
          if (haveMaxTime && (tme.floatV > maxTime))
            addExample(table, newWeight, *ei, WEIGHT(*ei), -1);
          else {
            float KM_t = KM->p(tme.floatV);
            if (method==km) {
              if (KM_t>0) {
                float origw = WEIGHT(*ei);
                float fact = KM_max/KM_t;
                addExample(table, newWeight, *ei, origw*fact, complementary, origw*(1-fact));
              }
            }
            else {
              float origw = WEIGHT(*ei);
              addExample(table, newWeight, *ei, origw*KM_t, complementary, origw*(1-KM_t));
            }
          }
        }
      }
    }
  }

  else
    raiseError("unknown weighting method");

  return wtable;
}
  


// Default construction: no attribute list, 'discretizeClass' off and no
// discretization method (operator() requires 'method' to be set).
TPreprocessor_discretize::TPreprocessor_discretize()
  : attributes(), discretizeClass(false), method()
{
}


// Constructs the preprocessor from the list of attributes to discretize
// ('attrs'), the value for 'discretizeClass' ('nocl') and the discretization
// method ('meth').
TPreprocessor_discretize::TPreprocessor_discretize(PVarList attrs, const bool &nocl, PDiscretization meth)
  : attributes(attrs), discretizeClass(nocl), method(meth)
{
}


// Builds a new domain in which continuous variables are replaced by the
// variables returned by 'method', and returns the examples from 'gen'
// converted to that domain.
//
// A continuous variable of the domain is replaced if 'attributes' is null or
// empty, or if it is listed in 'attributes'. Listed attributes that are not
// among the domain's variables must be continuous meta attributes; they are
// replaced in the list of metas. All other variables and metas are carried
// over unchanged. 'newWeight' is set to 'weightID'.
//
// Raises an error if 'method' is not set, or if a listed attribute is not
// found or is not continuous.
//
// NOTE(review): 'discretizeClass' is not consulted here -- the class is the
// last entry of domain.variables and is treated like any other variable.
// Confirm whether that is intended.
PExampleGenerator TPreprocessor_discretize::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{ 
  checkProperty(method);

  TVarList discretized;
  vector<int> discretizedMetas;
  TDomain *newDomain = mlnew TDomain();
  PDomain wdomain(newDomain);
  
  const TDomain &domain = gen->domain.getReference();
  
  // Every variable is first pushed to both 'variables' and 'attributes';
  // the class is moved out of 'attributes' afterwards.
  const_PITERATE(TVarList, vi, domain.variables)
    if (   ((*vi)->varType == TValue::FLOATVAR)
        && (   !attributes || !attributes->size() 
            || exists(attributes->begin(), attributes->end(), *vi))) {
      PVariable evar = method->operator()(gen, *vi);
      newDomain->variables->push_back(evar);
      newDomain->attributes->push_back(evar);
      discretized.push_back(*vi);
    }
    else {
      newDomain->variables->push_back(*vi);
      newDomain->attributes->push_back(*vi);
    }

  if (gen->domain->classVar) {
    newDomain->classVar = newDomain->variables->back();
    newDomain->attributes->erase(newDomain->attributes->end()-1);
  }
  
  // Whatever was requested but not handled above is either missing, not
  // continuous, or a meta attribute.
  if (attributes)
    PITERATE(TVarList, ai, attributes)
      if (!exists(discretized.begin(), discretized.end(), *ai)) {
        // Pass 'false' as the other look-ups in this file do before comparing
        // the result with ILLEGAL_INT, so that the error below is the one raised.
        long varNum = domain.getVarNum(*ai, false);
        if (varNum == ILLEGAL_INT)
          raiseError("Attribute '%s' is not found", (*ai)->name.c_str());
        else if ((varNum >= 0) || ((*ai)->varType != TValue::FLOATVAR))
          raiseError("Attribute '%s' is not continuous", (*ai)->name.c_str());
        else {
          PVariable evar = method->operator()(gen, *ai);
          TMetaDescriptor ndsc(varNum, evar);
          newDomain->metas.push_back(ndsc);
          discretizedMetas.push_back(varNum);
        }
      }

  // Copy the metas that were not replaced by a discretized version.
  const_ITERATE(TMetaVector, mi, domain.metas)
    if (!exists(discretizedMetas.begin(), discretizedMetas.end(), (*mi).id))
      newDomain->metas.push_back(*mi);
      
  newWeight = weightID;
  return mlnew TExampleTable(newDomain, gen);
}


// Constructs a classifier for 'newVar' whose 'classifierFromVar' is a
// TClassifierFromVar built from (newVar, oldVar). 'imputer' is left
// default-initialised and has to be set before the classifier is called.
TImputeClassifier::TImputeClassifier(PVariable newVar, PVariable oldVar)
  : TClassifier(newVar),
    classifierFromVar(mlnew TClassifierFromVar(newVar, oldVar))
{
}

// Copy constructor: copies the base part and both wrapped members
// ('classifierFromVar' and 'imputer') from 'old'.
TImputeClassifier::TImputeClassifier(const TImputeClassifier &old)
  : TClassifier(old),
    classifierFromVar(old.classifierFromVar),
    imputer(old.imputer)
{
}


// Returns the value that 'classifierFromVar' gives for 'ex'; if that value is
// special, the value predicted by 'imputer' is returned instead.
// Both properties must be set.
TValue TImputeClassifier::operator ()(const TExample &ex)
{
  checkProperty(classifierFromVar);
  checkProperty(imputer);

  const TValue original = classifierFromVar->call(ex);

  if (original.isSpecial())
    return imputer->call(ex);

  return original;
}


// Returns the examples from 'gen' converted to a domain in which every
// attribute that has at least one special (unknown) value in the data is
// replaced by a clone whose 'getValueFrom' is a TImputeClassifier.
//
// For each such attribute a model is trained with 'learner' on the data in
// which that attribute serves as the class and the other attributes as
// attributes; the model becomes the classifier's 'imputer'.
// Only attributes are examined, not the class. 'newWeight' is set to
// 'weightID'. Raises an error if 'learner' is not set.
PExampleGenerator TPreprocessor_imputeByLearner::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  checkProperty(learner);

  TDomain &domain = gen->domain.getReference();

  // determine the attributes with unknown values
  vector<int> knowns;
  for(int i = 0, e = domain.attributes->size(); i<e; i++)
    knowns.push_back(i);
  vector<int> unknowns;

  PEITERATE(ei, gen) {
    // Single pass over the attributes not yet seen with an unknown value;
    // erase() hands back the next valid iterator, so no restart is needed.
    for(vector<int>::iterator ci(knowns.begin()); ci != knowns.end(); )
      if ((*ei)[*ci].isSpecial()) {
        unknowns.push_back(*ci);
        ci = knowns.erase(ci);
      }
      else
        ++ci;

    // every attribute already has an unknown value somewhere: stop scanning
    if (!knowns.size())
      break;
  }

  TVarList newVars = domain.attributes.getReference();
  ITERATE(vector<int>, ki, unknowns) {
    PVariable &oldVar = domain.attributes->at(*ki);
    PVariable newVar = CLONE(TVariable, oldVar);

    // learning domain: the attribute to impute is the class, the rest are attributes
    TVarList learnAttrs = domain.attributes.getReference();
    learnAttrs.erase(learnAttrs.begin() + *ki);
    PDomain learnDomain = mlnew TDomain(oldVar, learnAttrs);
    PExampleGenerator data = mlnew TExampleTable(learnDomain, gen);

    TImputeClassifier *imputeClassifier = mlnew TImputeClassifier(newVar, oldVar);
    PClassifier wimputeClassifier = imputeClassifier;

    imputeClassifier->imputer = learner->call(data, weightID);

    newVar->getValueFrom = wimputeClassifier;

    newVars[*ki] = newVar;
  }

  newWeight = weightID;
  PDomain newDomain = mlnew TDomain(domain.classVar, newVars);
  return mlnew TExampleTable(newDomain, gen);
}



// Constructs the preprocessor around the given filter.
TPreprocessor_filter::TPreprocessor_filter(PFilter filt)
  : filter(filt)
{
}

// Returns the result of filterExamples() applied to 'gen' with 'filter'.
// 'filter' must be set; 'newWeight' is set to 'weightID'.
PExampleGenerator TPreprocessor_filter::operator()(PExampleGenerator gen, const int &weightID, int &newWeight)
{
  checkProperty(filter);

  newWeight = weightID;

  return filterExamples(filter, gen);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -