📄 measures.cpp

📁 orange源码数据挖掘技术
💻 CPP
📖 第 1 页 / 共 4 页
字号:
    const TDiscDistribution &dist = CAST_TO_DISCDISTRIBUTION(*ci);
    if (ci == mostCommon) {
      TDiscDistribution dist2 = dist;
      dist2 += probabilities->innerDistributionUnknown;
      N += dist2.cases;
      continueCost += dist2.cases * majorityCost(dist2);
    }
    else {
      N += dist.cases;
      continueCost += dist.cases * majorityCost(dist.distribution);
    }
  }

  if (unknownsTreatment == UnknownsAsValue) {
    const float &cases = probabilities->innerDistributionUnknown->cases;
    N += cases;
    continueCost += cases * majorityCost(CAST_TO_DISCDISTRIBUTION(probabilities->innerDistributionUnknown));
  }

  continueCost /= N;

  float cost = stopCost - continueCost;
  if (unknownsTreatment == ReduceByUnknowns)
    cost *= (outer.cases / (outer.unknowns + outer.cases));

  return round0(cost);
}



// Constructs the MSE measure. `unkTreat` selects how examples with an unknown
// attribute value are handled (compared against ReduceByUnknowns,
// UnknownsToCommon and UnknownsAsValue in operator()). The m-estimate
// parameter `m` starts at 0, which disables the prior correction there.
// NOTE(review): the meaning of the two boolean flags passed to the base class
// is not visible in this file -- confirm against TMeasureAttribute.
TMeasureAttribute_MSE::TMeasureAttribute_MSE(const int &unkTreat)
  : TMeasureAttribute(Contingency_Class, false, true)
  , m(0)
  , unknownsTreatment(unkTreat)
{
}


// Scores a discrete attribute against a continuous class as the relative
// reduction  (I_orig - I) / I_orig, where
//   I_orig = (sum2 - sum*sum/abs) / abs   taken from `classDistribution`, and
//   I      = the same quantity accumulated over the per-value class
//            distributions in `cont` and divided by their total weight.
//
// cont               contingency; the outer (attribute) variable must be
//                    INTVAR and the inner (class) variable FLOATVAR,
//                    otherwise raiseError is called.
// classDistribution  continuous distribution of the class.
// apriorClass        optional prior; used only when non-null and m > 0, in
//                    which case I is blended with the prior's I using weight m.
//
// Returns 0 when the class distribution carries no weight, when I_orig is not
// positive, or when no attribute value carries any weight. Otherwise returns
// round0 of the score, scaled by cases/(cases+unknowns) of the outer
// distribution under ReduceByUnknowns.
float TMeasureAttribute_MSE::operator()(PContingency cont, PDistribution classDistribution, PDistribution apriorClass)
{
  checkDiscreteContinuous(cont, "MeasureAttribute_MSE");

  const TDistribution &outer = CAST_TO_DISCDISTRIBUTION(cont->outerDistribution);
  
  if (cont->innerVariable->varType!=TValue::FLOATVAR)
    raiseError("cannot evaluate attribute in domain with discrete classes");
  if (cont->outerVariable->varType!=TValue::INTVAR)
    raiseError("cannot evaluate continuous attributes");

  const TContDistribution &classDist = CAST_TO_CONTDISTRIBUTION(classDistribution);

  const float W = classDist.abs;
  if (W<=0)
    return 0.0;

  const float I_orig = (classDist.sum2 - classDist.sum*classDist.sum/W) / W;
  if (I_orig<=0.0)
    return 0.0;

  // Under UnknownsToCommon the examples with an unknown attribute value are
  // counted with the most probable value; otherwise no iterator matches.
  TDistributionVector::const_iterator mostCommon = (unknownsTreatment == UnknownsToCommon)
    ? cont->discrete->begin() + outer.highestProbIntIndex()
    : cont->discrete->end();

  float I = 0;
  float downW = 0;
  const_ITERATE(TDistributionVector, ci, *cont->discrete) {
    const TContDistribution &tdist = CAST_TO_CONTDISTRIBUTION(*ci);
    float ssum2 = tdist.sum2;
    float ssum = tdist.sum;
    float sabs = tdist.abs;

    if (ci==mostCommon) {
      // Add the class distribution of the examples whose attribute value is
      // unknown (the same distribution the UnknownsAsValue branch below uses),
      // not the class distribution of all examples.
      const TContDistribution &unknowns = CAST_TO_CONTDISTRIBUTION(cont->innerDistributionUnknown);
      ssum2 += unknowns.sum2;
      ssum += unknowns.sum;
      sabs += unknowns.abs;
    }

    // A value without any weight contributes nothing; skipping it also avoids 0/0.
    if (sabs>0) {
      I += ssum2 - ssum*ssum/sabs;
      downW += sabs;
    }
  }

  if (unknownsTreatment == UnknownsAsValue) {
    const TContDistribution &tdist = CAST_TO_CONTDISTRIBUTION(cont->innerDistributionUnknown);
    // With no unknown values the term would be 0/0 and turn the score into NaN.
    if (tdist.abs>0) {
      I += tdist.sum2 - tdist.sum*tdist.sum/tdist.abs;
      downW += tdist.abs;
    }
  }

  bool priorApplied = false;
  if (apriorClass && (m>0)) {
    const TContDistribution &tdist = CAST_TO_CONTDISTRIBUTION(apriorClass);
    // An empty prior cannot be used; fall back to the plain estimate below.
    if (tdist.abs>0) {
      I =   (I + m * (tdist.sum2 - tdist.sum * tdist.sum/tdist.abs) / tdist.abs)
          / (downW + m);
      priorApplied = true;
    }
  }

  if (!priorApplied) {
    // No attribute value carries any weight: the attribute tells us nothing.
    if (downW<=0)
      return 0.0;
    I /= downW;
  }

  float mse = (I_orig - I)/I_orig;
  if (unknownsTreatment == ReduceByUnknowns)
    mse *= (outer.cases / (outer.unknowns + outer.cases));
  
  return round0(mse);
}



// Constructs the ReliefF measure.
//   ak  stored in `k`; prepareNeighbours collects neighbours per class until
//       their accumulated weight reaches k.
//   am  stored in `m`; the amount (weight) of reference examples to use,
//       where -1 makes prepareNeighbours use every example.
// prevExamples/prevWeight cache the example-set version and weight id for
// which the neighbourhood was last built (see checkNeighbourhood); -1 and 0
// are the initial "nothing cached yet" values.
TMeasureAttribute_relief::TMeasureAttribute_relief(int ak, int am)
  : TMeasureAttribute(Generator, true, false)
  , k(ak)
  , m(am)
  , prevExamples(-1)
  , prevWeight(0)
{
}




// Strict weak ordering on (index, distance) pairs: orders by the distance
// stored in `second` and ignores the index in `first`.
inline bool compare2nd(const pair<int, float> &o1, const pair<int, float> &o2)
{
  return o2.second > o1.second;
}


// Rebuilds `neighbourhood`: for a set of reference examples, the nearest
// examples from every class together with the weights later used by
// operator() to accumulate per-attribute scores.
//
// gen       source of examples; its domain must have a discrete (INTVAR) or
//           continuous (FLOATVAR) class, otherwise raiseError is called.
// weightID  id of the weight meta-attribute; referenced by the WEIGHT macro
//           and, when zero, lets m > N fall back to using all examples.
//
// Steps:
//   1. copy the examples with a known class into `storedExamples`;
//   2. group example indices by class (a single group for regression);
//   3. for every reference example (all of them, or random picks until their
//      accumulated weight reaches m) sort each group by distance, skip
//      examples at distance <= 0, and take neighbours -- whole groups of
//      equal distance at a time -- until weight k has been collected;
//   4. normalise the stored weights.
// Also sets `distance`, `ndC` and (for regression) `m_ndC`.
void TMeasureAttribute_relief::prepareNeighbours(PExampleGenerator gen, const int &weightID)
{
  neighbourhood.clear();

  if (!gen->domain->classVar)
    raiseError("classless domain");

  const bool regression = gen->domain->classVar->varType == TValue::FLOATVAR;

  if (!regression && (gen->domain->classVar->varType != TValue::INTVAR))
    raiseError("cannot compute ReliefF of a class that is neither discrete nor continuous");
  
  storedExamples = mlnew TExampleTable(gen->domain, !gen.is_derived_from(TExampleTable));
  TExampleTable &table = dynamic_cast<TExampleTable &>(storedExamples.getReference());
  PEITERATE(ei, gen)
    if (!(*ei).getClass().isSpecial())
      table.addExample(*ei);

  const int N = table.numberOfExamples();
  if (!N)
    raiseError("no examples with known class");

  // The class is addressed as the attribute following the last ordinary one.
  const int classIdx = table.domain->attributes->size();

  vector<vector<int> > examplesByClasses(regression ? 1 : table.domain->classVar->noOfValues());

  if (table.domain->classVar->varType==TValue::INTVAR) {
    int index;
    TExampleIterator ei;

    for(ei = table.begin(), index = 0; ei; ++ei, index++)
      examplesByClasses.at(int((*ei).getClass())).push_back(index);

    // Drop classes without examples. erase() invalidates the iterator it is
    // given, so continue from the iterator it returns.
    vector<vector<int> >::iterator ebci = examplesByClasses.begin();
    while (ebci != examplesByClasses.end()) {
      if (ebci->empty())
        ebci = examplesByClasses.erase(ebci);
      else
        ++ebci;
    }
  }
  else {
    // Regression: one group that holds every example index.
    vector<int> &allExamples = examplesByClasses.front();
    allExamples.resize(N);
    for(int i = 0; i < N; i++)
      allExamples[i] = i;
  }


  distance = TExamplesDistanceConstructor_Relief()(gen);
  const TExamplesDistance_Relief &rdistance = dynamic_cast<const TExamplesDistance_Relief &>(distance.getReference());

  TRandomGenerator rgen(N);
  int referenceIndex = 0;
  const bool useAll = (m==-1) || (!weightID && (m>N));
  float referenceExamples, referenceWeight;

  // referenceExamples accumulates the weight of the reference examples used so far.
  for(referenceExamples = 0; useAll ? (referenceIndex < N) : (referenceExamples < m); referenceExamples += referenceWeight, referenceIndex++) {
    if (!useAll)
      referenceIndex = rgen.randlong(N);
    TExample &referenceExample = table[referenceIndex];
    referenceWeight = WEIGHT(referenceExample);

    const TValue &referenceValue = referenceExample.getClass();
    const int referenceClass= regression ? 0 : referenceExample.getClass().intV;

    neighbourhood.push_back(referenceIndex);
    vector<TNeighbourExample> &refNeighbours = neighbourhood.back().neighbours;

    // NOTE(review): ndC is reset for every reference example, so after the
    // loop it only reflects the last one -- confirm this is intended.
    ndC = 0.0;

    // NOTE(review): the original also computed a per-class reference weight
    // (-1 for the reference's own class, |class|/(N-|class|) otherwise) that
    // was never used; it is omitted here -- confirm it was not meant to scale
    // the neighbour weights below.
    ITERATE(vector<vector<int> >, cli, examplesByClasses) {
      vector<pair<int, float> > distances(cli->size());
      vector<pair<int, float> >::iterator disti = distances.begin(), diste;
      ITERATE(vector<int> , clii, *cli)
        *disti++ = make_pair(*clii, rdistance(referenceExample, table[*clii]));

      diste = distances.end();
      disti = distances.begin();
      sort(disti, diste, compare2nd);

      // Neighbours appended for this class start here; needed for rescaling below.
      int startNew = refNeighbours.size();

      // Skip examples at non-positive distance (this includes the reference itself).
      while(disti != diste && (disti->second <= 0))
        disti++;

      float inWeight, needwei;
      for(needwei = k; (disti != diste) && (needwei > 1e-6); ) {
        const float thisDist = disti->second;
        inWeight = 0.0;
        // Take all examples at exactly this distance together.
        do {
          TExample &neighbourExample = table[disti->first];

          const float neighbourWeight = WEIGHT(neighbourExample);
          const float weightEE = neighbourWeight * referenceWeight;
          inWeight += neighbourWeight;

          if (regression) {
            const float classDist = rdistance(classIdx, neighbourExample.getClass(), referenceValue);
            refNeighbours.push_back(TNeighbourExample(disti->first,
                                                      weightEE * classDist,
                                                      weightEE));
            ndC += weightEE * classDist;
          }
          else
            refNeighbours.push_back(TNeighbourExample(disti->first, weightEE * (neighbourExample.getClass().intV == referenceClass ? -1 : 1)));
        } while ((++disti != diste) && (disti->second == thisDist));

        needwei -= inWeight;
      }

      // k - needwei is the neighbour weight actually collected for this class;
      // rescale only when it exceeds 1.
      if (k-needwei > 1) {
        const float adj = 1.0 / (k - needwei);
        if (regression)
          for(vector<TNeighbourExample>::iterator ai(refNeighbours.begin() + startNew), ae(refNeighbours.end()); ai != ae; ai++) {
            ai->weight *= adj;
            ai->weightEE *= adj;
          }
        else
          for(vector<TNeighbourExample>::iterator ai(refNeighbours.begin() + startNew), ae(refNeighbours.end()); ai != ae; ai++)
            ai->weight *= adj;
      }
    }
  }


  if (regression)
    m_ndC = referenceExamples - ndC;
  else {
    // Classification: divide every neighbour weight by the total reference weight.
    const float adj = 1.0 / referenceExamples;
    ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
      ITERATE(vector<TNeighbourExample>, nei, rei->neighbours)
        nei->weight *= adj;
    }
  }
}


// Makes sure the cached neighbourhood matches `gen` and `weightID`.
// Calls raiseError for a domain without a class. When the generator's version
// or the weight id differs from the cached one, the cached measures are
// discarded, the neighbourhood is rebuilt and the new version and weight id
// are remembered.
void TMeasureAttribute_relief::checkNeighbourhood(PExampleGenerator gen, const int &weightID)
{
  if (!gen->domain->classVar)
    raiseError("class-less domain");

  const bool cacheIsCurrent = (prevExamples == gen->version) && (weightID == prevWeight);
  if (cacheIsCurrent)
    return;

  measures.clear();
  prepareNeighbours(gen, weightID);
  prevExamples = gen->version;
  prevWeight = weightID;
}


// Evaluates `variable` on every example of `gen` and tabulates the results.
//
// gen       examples to evaluate.
// weightID  id of the weight meta-attribute (referenced by the WEIGHT macro).
// variable  variable whose computeValue() is called for each example.
// min, max  out: smallest and largest known value; both 0 when no value is known.
// avg       out: weighted mean of the known values (left as the weighted sum
//           when the total weight is not above 1e-6).
// N         out: total weight of the examples with a known value.
//
// Returns a new[]-allocated array with one entry per example, in iteration
// order; special (unknown) values are stored as ILLEGAL_FLOAT. The caller
// owns the array. If evaluation throws, the array is released and the
// exception is rethrown.
float *tabulateContinuousValues(PExampleGenerator gen, const int &weightID, TVariable &variable,
                                float &min, float &max, float &avg, float &N)
{
  float *precals = new float[gen->numberOfExamples()];
  float *pc = precals;

  avg = N = 0.0;
  // Defined values even when every example is unknown, so that callers
  // comparing min and max never read uninitialised floats.
  min = max = 0.0;

  try {
    // Tracked separately from N: examples with zero weight leave N at 0 but
    // must not restart the min/max range.
    bool seenValue = false;

    PEITERATE(ei, gen) {
      const TValue &val = variable.computeValue(*ei);
      if (val.isSpecial())
        *pc++ = ILLEGAL_FLOAT;
      else {
        *pc++ = val.floatV;
        if (!seenValue) {
          max = min = val.floatV;
          seenValue = true;
        }
        else if (val.floatV > max)
          max = val.floatV;
        else if (val.floatV < min)
          min = val.floatV;

        const float w = WEIGHT(*ei);
        avg += w * val.floatV;
        N += w;
      }
    }
  }
  catch (...) {
    delete[] precals;
    throw;
  }

  if (N > 1e-6)
    avg /= N;

  return precals;
}


// Evaluates the discrete `variable` on every example of `gen` and tabulates
// the results.
//
// gen       examples to evaluate.
// weightID  id of the weight meta-attribute (referenced by the WEIGHT macro).
// variable  must be a TEnumVariable (dynamic_cast throws otherwise).
// unk       out: new[]-allocated array with one entry per value v holding
//           1 - p(v), where p(v) is the weighted relative frequency of v
//           among the examples with a valid value. Owned by the caller.
// bothUnk   out: 1 - sum over v of p(v)^2.
//
// Returns a new[]-allocated array with one entry per example, in iteration
// order; special or out-of-range values are stored as ILLEGAL_INT. The caller
// owns the array. On an exception both arrays are released, `unk` is set to
// NULL and the exception is rethrown.
int *tabulateDiscreteValues(PExampleGenerator gen, const int &weightID, TVariable &variable,
                            float *&unk, float &bothUnk)
{
  const int noVal = dynamic_cast<TEnumVariable &>(variable).noOfValues();

  int *precals = NULL;
  unk = NULL;

  try {
    // Allocate inside the try block so that a failure of the second
    // allocation does not leak the first one.
    precals = new int[gen->numberOfExamples()];
    unk = new float[noVal];

    float *ui, *const ue = unk + noVal;
    for(ui = unk; ui != ue; *ui++ = 0.0);

    float totalWeight = 0.0;
    int *pc = precals;
    PEITERATE(ei, gen) {
      const TValue &val = variable.computeValue(*ei);
      if (val.isSpecial() || (val.intV >= noVal) || (val.intV < 0))
        *pc++ = ILLEGAL_INT;
      else {
        *pc++ = val.intV;
        const float w = WEIGHT(*ei);
        unk[val.intV] += w;
        totalWeight += w;
      }
    }

    // Turn the accumulated weights into relative frequencies before using
    // them as probabilities; with no valid value every p(v) is 0.
    bothUnk = 1.0;
    for(ui = unk; ui != ue; ui++) {
      float p = 0.0;
      if (totalWeight > 0.0)
        p = *ui / totalWeight;
      bothUnk -= p * p;
      *ui = 1 - p;
    }
  }
  catch (...) {
    // Both buffers come from new[], so they must be released with delete[].
    delete[] unk;
    unk = NULL;
    delete[] precals;
    precals = NULL;
    throw;
  }

  return precals;
}


float TMeasureAttribute_relief::operator()(PVariable var, PExampleGenerator gen, PDistribution aprior, int weightID)
{
  checkNeighbourhood(gen, weightID);

  // the attribute is in the domain
  const int attrIdx = gen->domain->getVarNum(var, false);
  if (attrIdx != ILLEGAL_INT) {
    if (measures.empty()) {
      const TExamplesDistance_Relief &rdistance = dynamic_cast<const TExamplesDistance_Relief &>(distance.getReference());

      const TExampleTable &table = dynamic_cast<const TExampleTable &>(gen.getReference());
      const int nAttrs = gen->domain->attributes->size();
      measures = vector<float>(nAttrs, 0.0);
      vector<float>::iterator mb(measures.begin()), mi;
      const vector<float>::const_iterator me(measures.end());
      TExample::const_iterator e1i, e1b, e2i;
      int attrNo;

      if (gen->domain->classVar->varType == TValue::FLOATVAR) {
        vector<float> ndA(nAttrs, 0.0);
        vector<float> ndCdA(nAttrs, 0.0);
        vector<float>::iterator ndAb(ndA.begin()), ndAi;
        const vector<float>::const_iterator ndAe(ndA.end());
        vector<float>::iterator ndCdAb(ndCdA.begin()), ndCdAi;

        ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
          const TExample &referenceExample = table[rei->index];
          e1b = referenceExample.begin();
          ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
            const float &weight = nei->weight;
            const float &weightEE = nei->weightEE;
            for(attrNo = 0, e1i = e1b, e2i = table[nei->index].begin(), ndAi = ndAb, ndCdAi = ndCdAb; ndAi != ndAe; ndAi++, ndCdAi++, e1i++, e2i++, attrNo++) {
              const float attrDist = rdistance(attrNo, *e1i, *e2i);
              *ndAi += weightEE * attrDist;
              *ndCdAi += weight * attrDist;
            }
          }
        }
        for(ndAi = ndAb, ndCdAi = ndCdAb, mi = mb; mi != me; mi++, ndAi++, ndCdAi++)
          *mi = *ndCdAi / ndC - (*ndAi - *ndCdAi) / m_ndC;
      }
      else {
        ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
          const TExample &referenceExample = table[rei->index];
          e1b = referenceExample.begin();
          ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
            const float &weight = nei->weight;
            for(attrNo = 0, e1i = e1b, e2i = table[nei->index].begin(), mi = mb; mi != me; e1i++, e2i++, mi++, attrNo++)
              *mi += weight * rdistance(attrNo, *e1i, *e2i);
          }
        }
      }
    }

    return measures[attrIdx];
  }


  // the attribute is not in the domain
  else {
    if (!var->getValueFrom)
      raiseError("attribute is not among the domain attributes and cannot be computed from them");
  
    const TExampleTable &table = dynamic_cast<const TExampleTable &>(gen.getReference());
    TVariable &variable = var.getReference();
    const int nExamples = gen->numberOfExamples();

    PExamplesDistance distance;


    // continuous attribute
    if (variable.varType == TValue::FLOATVAR) {
      float avg, min, max, N;
      float *precals = tabulateContinuousValues(gen, weightID, variable, min, max, avg, N);

      try {
        if ((min == max) || (N < 1e-6)) {
          delete precals;
          return 0.0;
        }

        const float nor = 1.0 / (min-max);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -