📄 measures.cpp

📁 orange源码数据挖掘技术
💻 CPP
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
        // continuous attribute, continuous class
        if (gen->domain->classVar->varType == TValue::FLOATVAR) {
          float ndA = 0.0, ndCdA = 0.0;
          ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
            const float refVal = precals[rei->index];
            if (refVal == ILLEGAL_FLOAT)
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const float neiVal = precals[nei->index];
                const float attrDist = (neiVal == ILLEGAL_FLOAT) ? 0.5 : fabs(avg - neiVal) * nor;
                ndA += nei->weightEE * attrDist;
                ndCdA += nei->weight * attrDist;
              }
            else {
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const float neiVal = precals[nei->index];
                const float attrDist = fabs(refVal - (neiVal == ILLEGAL_FLOAT ? avg : neiVal)) * nor;
                ndA += nei->weightEE * attrDist;
                ndCdA += nei->weight * attrDist;
              }
            }
          }

          delete precals;
          return ndCdA / ndC - (ndA - ndCdA) / m_ndC;
        }

        // continuous attribute, discrete class
        else {
          float relf = 0.0;

          ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
            const float refVal = precals[rei->index];
            if (refVal == ILLEGAL_FLOAT)
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const float neiVal = precals[nei->index];
                const float attrDist = (neiVal == ILLEGAL_FLOAT) ? 0.5 : fabs(avg - neiVal) * nor;
                relf += nei->weight * attrDist;
              }
            else {
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const float neiVal = precals[nei->index];
                const float attrDist = fabs(refVal - (neiVal == ILLEGAL_FLOAT ? avg : neiVal)) * nor;
                relf += nei->weight * attrDist;
              }
            }
          }

          delete precals;
          return relf;
        }

      }
      catch (...) {
        delete precals;
        throw;
      }
    }


    // discrete attribute
    else {
      float *unk, bothUnk;
      int *precals = tabulateDiscreteValues(gen, weightID, var.getReference(), unk, bothUnk);

      try {
        // discrete attribute, continuous class
        if (gen->domain->classVar->varType == TValue::FLOATVAR) {
          float ndA = 0.0, ndCdA = 0.0;
          ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
            const int refVal = precals[rei->index];
            if (refVal == ILLEGAL_INT)
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const int neiVal = precals[nei->index];
                const float attrDist = (neiVal == ILLEGAL_INT) ? bothUnk : unk[neiVal];
                ndA += nei->weightEE * attrDist;
                ndCdA += nei->weight * attrDist;
              }
            else {
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const int neiVal = precals[nei->index];
                const float attrDist = (neiVal == ILLEGAL_INT) ? unk[refVal] : (refVal != neiVal ? 1.0 : 0.0);
                ndA += nei->weightEE * attrDist;
                ndCdA += nei->weight * attrDist;
              }
            }
          }

          delete unk;
          delete precals;
          return ndCdA / ndC - (ndA - ndCdA) / m_ndC;
        }

        // discrete attribute, discrete class
        else {
          float relf = 0.0;

          ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
            const int refVal = precals[rei->index];
            if (refVal == ILLEGAL_FLOAT)
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const int neiVal = precals[nei->index];
                relf += nei->weight * ((neiVal == ILLEGAL_INT) ? bothUnk : unk[neiVal]);
              }
            else {
              ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
                const int neiVal = precals[nei->index];
                relf += nei->weight * ((neiVal == ILLEGAL_INT) ? unk[refVal] : (refVal != neiVal ? 1.0 : 0.0));
              }
            }
          }

          delete unk;
          delete precals;
          return relf;
        }
      }
      catch (...) {
        delete unk;
        delete precals;
        throw;
      }
    }
  }
}



// Convenience overload: runs the TFunctionAdder-based thresholdFunction() and
// converts its result into a list of (key, running total) pairs.
//
// `res` is cleared first; then, for every entry of the adder in iteration order,
// the entry's second component is added to a running total and the pair
// (entry's first component, total so far) is appended to `res`.
// The unnamed PDistribution argument is ignored.
void TMeasureAttribute_relief::thresholdFunction(TFloatFloatList &res, PVariable var, PExampleGenerator gen, PDistribution, int weightID)
{
  TFunctionAdder contributions;
  thresholdFunction(var, gen, contributions, weightID);

  res.clear();

  float runningTotal = 0;
  TFunctionAdder::const_iterator entry(contributions.begin());
  const TFunctionAdder::const_iterator last(contributions.end());
  while (entry != last) {
    runningTotal += entry->second;
    res.push_back(make_pair(entry->first, runningTotal));
    entry++;
  }
}


// Searches the entries collected by thresholdFunction() for the split with the
// highest cumulative score.
//
// The entries of `divs` are visited in order while their second components are
// summed; the position with the largest running sum wins, and ties are broken
// randomly through rgen.randbool(++wins).
//
//   subsetSizes  out: a two-element TDiscDistribution holding the left and right
//                weights of the chosen split when minSubset > 0 and a split was
//                found; NULL otherwise
//   bestScore    out: the winning running sum; left untouched when nothing wins
//   minSubset    when > 0, both sides of the split must weigh at least this much
//
// Returns the chosen threshold, or ILLEGAL_FLOAT when no candidate qualified.
float TMeasureAttribute_relief::bestThreshold(PDistribution &subsetSizes, float &bestScore, PVariable var, PExampleGenerator gen, PDistribution, int weightID, const float &minSubset)
{
  TFunctionAdder divs;
  int wins = 0;
  float score = 0.0, bestThreshold = ILLEGAL_FLOAT;
  TRandomGenerator rgen(gen->numberOfExamples());

  if (minSubset > 0) {
    // Initialized so that the test below is well defined even if the callee
    // does not store anything through the pointer.
    float *attrVals = NULL;
    thresholdFunction(var, gen, divs, weightID, &attrVals);

    TContDistribution *valueDistribution;
    PDistribution wvd;

    if (attrVals) {
      // The callee handed over a per-example array of attribute values; build
      // the value distribution from it and release it afterwards.
      try {
        float *vali = attrVals, *vale;
        wvd = valueDistribution = new TContDistribution(var);
        // The braces are essential: without them the `else` pairs with the inner
        // `if (*vali != ILLEGAL_FLOAT)`, so the unweighted case added nothing and
        // the weighted case re-ran the unweighted loop on every unknown value.
        if (weightID) {
          for(TExampleIterator ei(gen->begin()); ei; ++ei, vali++)
            if (*vali != ILLEGAL_FLOAT)
              valueDistribution->addfloat(*vali, WEIGHT(*ei));
        }
        else {
          for(vali = attrVals, vale = attrVals + gen->numberOfExamples(); vali != vale; vali++)
            if (*vali != ILLEGAL_FLOAT)
              valueDistribution->addfloat(*vali);
        }
      }
      catch (...) {
        // attrVals is walked as an array above, hence the array form of delete
        delete[] attrVals;
        throw;
      }

      delete[] attrVals;
      attrVals = NULL;
    }
    else {
      wvd = new TContDistribution(gen, var, weightID);
      valueDistribution = wvd.AS(TContDistribution);
    }

    // Weight on each side of the candidate split; everything starts on the right.
    float left = 0.0, right = valueDistribution->abs;
    float bestLeft = 0.0, bestRight = 0.0;

    map<float, float>::iterator distb(valueDistribution->begin()), diste(valueDistribution->end()), disti = distb, disti2;
    for(TFunctionAdder::const_iterator di(divs.begin()), de(divs.end()); di != de; di++) {
      score += di->second;
      if (!wins || (score > bestScore) || ((score == bestScore) && rgen.randbool(++wins))) {
        // Move all values not greater than the candidate to the left side.
        for(; (disti != diste) && (disti->first <= di->first); disti++) {
          left += disti->second;
          right -= disti->second;
        }
        if ((left < minSubset))
          continue;
        // The right side can only shrink from here on, so stop searching.
        if ((right < minSubset) || (disti == diste))
          break;

        if (!wins || (score > bestScore))
          wins = 1;

        bestScore = score;
        bestLeft = left;
        bestRight = right;

        // disti cannot be distb (contemplate the above for)
        disti2 = disti;
        bestThreshold = (disti->first + (--disti2)->first) / 2.0;
      }
    }

    if (!wins) {
      subsetSizes = NULL;
      return ILLEGAL_FLOAT;
    }

    subsetSizes = new TDiscDistribution(2);
    subsetSizes->addint(0, bestLeft);
    subsetSizes->addint(1, bestRight);
    return bestThreshold;
  }

  else {
    thresholdFunction(var, gen, divs, weightID);

    for(TFunctionAdder::const_iterator db(divs.begin()), de(divs.end()), di = db, di2; di != de; di++) {
      score += di->second;
      // A strictly better score resets the tie counter to 1 (the assignment is
      // part of the condition); an equal score replaces the current best with a
      // probability decided by rgen.randbool(++wins).
      if (   ((!wins || (score > bestScore)) && ((wins=1) == 1))
          || ((score == bestScore) && rgen.randbool(++wins))) {
        di2 = di;
        // Midpoint between this entry and the next one; the entry's own key is
        // used when it is the only entry.
        bestThreshold = (++di2 == de) && (--di2 == db) ? di->first : (di->first + di2->first) / 2.0;
        bestScore = score;
      }
    }

    subsetSizes = NULL;
    return wins ? bestThreshold : ILLEGAL_FLOAT;
  }
}


// Builds a symmetric matrix of gains between pairs of values of a discrete
// attribute, accumulated over all (reference example, neighbour) pairs in
// `neighbourhood`.
//
//   var        the attribute; must be a TEnumVariable, otherwise an error is raised
//   gen        the examples; checkNeighbourhood(gen, weightID) is called first
//   attrVals   optional out. For an attribute found in the domain it is set to
//              NULL. For a computed attribute it receives the array returned by
//              tabulateDiscreteValues(); the array is then not freed here on
//              success, so the caller has to release it.
//   attrDistr  optional out. For an attribute found in the domain it is set to
//              NULL. For a computed attribute it receives a new float[noOfValues]
//              array whose i-th element is 1 - unk[i]; the caller has to release it.
//
// Pairs in which either value is unknown contribute nothing. If an exception
// escapes, every buffer allocated here is freed and both out-pointers are
// reset to NULL before it is rethrown.
PSymMatrix TMeasureAttribute_relief::gainMatrix(PVariable var, PExampleGenerator gen, PDistribution, int weightID, int **attrVals, float **attrDistr)
{
  TEnumVariable *evar = var.AS(TEnumVariable);
  if (!evar)
    raiseError("gainMatrix can only be computed for discrete attributes");

  checkNeighbourhood(gen, weightID);

  TSymMatrix *gains = new TSymMatrix(evar->noOfValues());
  PSymMatrix wgains = gains;

  const int attrIdx = gen->domain->getVarNum(var, false);
  const bool regression = gen->domain->classVar->varType == TValue::FLOATVAR;

  if (attrIdx != ILLEGAL_INT) {
    // The attribute is part of the domain: read its values from the table.
    if (attrVals)
      *attrVals = NULL;
    if (attrDistr)
      *attrDistr = NULL;

    const TExamplesDistance_Relief &rdistance = dynamic_cast<const TExamplesDistance_Relief &>(distance.getReference());
    const TExampleTable &table = dynamic_cast<const TExampleTable &>(gen.getReference());

    ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
      const TValue &refVal = table[rei->index][attrIdx];
      if (refVal.isSpecial())
        continue;
      const int &refValI = refVal.intV;

      ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
        const TValue &neiVal = table[nei->index][attrIdx];
        if (neiVal.isSpecial())
          continue;

        const float attrDist = rdistance(attrIdx, refVal, neiVal);
        if (regression) {
          const float dCdA = nei->weight * attrDist;
          const float dA = nei->weightEE * attrDist;
          gains->getref(refValI, neiVal.intV) += dCdA / ndC - (dA - dCdA) / m_ndC;
        }
        else
          gains->getref(refValI, neiVal.intV) += nei->weight * attrDist;
      }
    }
  }

  else {
    // The attribute has to be computed from the others.
    if (!var->getValueFrom)
      raiseError("attribute is not among the domain attributes and cannot be computed from them");

    // bothUnk is only needed as an output argument of tabulateDiscreteValues().
    float *unk, bothUnk;
    int *precals = tabulateDiscreteValues(gen, weightID, var.getReference(), unk, bothUnk);
    if (attrVals)
      *attrVals = precals;
    if (attrDistr)
      *attrDistr = NULL;  // gives the handler below a well-defined pointer to release

    try {
      if (attrDistr) {
        const int noVal = evar->noOfValues();
        *attrDistr = new float[noVal];
        for(float *ai = *attrDistr, *ui = unk, *ue = unk + noVal; ui != ue; *ai++ = 1 - *ui++);
      }

      ITERATE(vector<TReferenceExample>, rei, neighbourhood) {
        const int refValI = precals[rei->index];
        // Unknown values cannot index the matrix; skip them as the in-domain
        // branch does. (The distance used to be stored in an int, which
        // truncated the fractional distances for unknowns to 0 and skipped them
        // by accident; a distance of 1 or more would have indexed the matrix
        // with ILLEGAL_INT.)
        if (refValI == ILLEGAL_INT)
          continue;

        ITERATE(vector<TNeighbourExample>, nei, rei->neighbours) {
          const int neiVal = precals[nei->index];
          // equal values are at distance 0 and add nothing
          if ((neiVal == ILLEGAL_INT) || (neiVal == refValI))
            continue;

          const float attrDist = 1.0;  // two different known values
          if (regression) {
            const float dCdA = nei->weight * attrDist;
            const float dA = nei->weightEE * attrDist;
            gains->getref(refValI, neiVal) += dCdA / ndC - (dA - dCdA) / m_ndC;
          }
          else
            gains->getref(refValI, neiVal) += nei->weight * attrDist;
        }
      }

      // Both buffers are indexed as arrays, hence the array form of delete.
      delete[] unk;
      if (!attrVals)
        delete[] precals;
    }
    catch (...) {
      delete[] unk;
      delete[] precals;
      // Do not leave the caller with a dangling or leaked buffer.
      if (attrVals)
        *attrVals = NULL;
      if (attrDistr) {
        delete[] *attrDistr;
        *attrDistr = NULL;
      }
      throw;
    }
  }

  return wgains;
}


// Exhaustively searches for the best split of a discrete attribute's values
// into a "left" and a "right" subset, scoring each split with the pairwise gains
// returned by gainMatrix().
//
// Splits are enumerated in Gray-code order so that consecutive splits differ in
// the side of exactly one value, which lets the score and the subset weights be
// updated incrementally. Ties are broken randomly through rgen.randbool(++wins).
//
//   subsetSizes  out: on success, a two-element TDiscDistribution with the
//                weights of the left and the right subset; not modified otherwise
//   bestScore    out: score of the chosen split; reset to 0 once the total
//                weight has passed the minSubset check
//   minSubset    minimal weight required on each side of the split
//
// Returns a list with one element per attribute value: 1 if the value goes to
// the right, 0 if it goes to the left, -1 if the value's weight is not positive.
// Returns NULL when the total weight is below minSubset or no admissible split
// exists. Raises an error for non-discrete attributes and for attributes with
// more than 16 values.
PIntList TMeasureAttribute_relief::bestBinarization(PDistribution &subsetSizes, float &bestScore, PVariable var, PExampleGenerator gen, PDistribution apriorClass, int weightID, const float &minSubset)
{
  TEnumVariable *evar = var.AS(TEnumVariable);
  if (!evar)
    raiseError("cannot discretly binarize a continuous attribute");

  const int noVal = evar->noOfValues();
  if (noVal > 16)
    raiseError("cannot binarize an attribute with more than 16 values (it would take too long)");

  // Both buffers are owned by this function and released on every exit path.
  float *attrDistr = NULL;
  float *gains = NULL, *gi, *ge;

  int wins = 0, bestSubset;
  float bestLeft, bestRight;

  try {
    PSymMatrix wgain = gainMatrix(var, gen, apriorClass, weightID, NULL, &attrDistr);
    TSymMatrix &gain = wgain.getReference();

    // Unpack the symmetric matrix into a dense noVal x noVal table.
    gains = new float[noVal * noVal];
    gi = gains;

    float thisScore = 0.0;
    int i, j;
    for(i = 0; i < noVal; i++)
      for(j = 0; j < noVal; j++)
        *gi++ = gain.getitem(i, j);

    // Initially every value is on the left, so thisLeft is the total weight.
    float thisLeft = 0.0, thisRight = 0.0;
    float *ai, *ae;
    if (!attrDistr) {
      // gainMatrix() did not provide the distribution; compute it from the data.
      TDiscDistribution dd(gen, var, weightID);
      attrDistr = new float[noVal];
      ai = attrDistr;
      ae = attrDistr + noVal;
      for(vector<float>::const_iterator di(dd.distribution.begin()); ai != ae; thisLeft += (*ai++ = *di++));
    }
    else
      for(ai = attrDistr, ae = attrDistr + noVal; ai != ae; thisLeft += *ai++);

    if (thisLeft < minSubset) {
      // release the buffers before leaving; this path used to leak both
      delete[] gains;
      gains = NULL;
      delete[] attrDistr;
      attrDistr = NULL;
      return NULL;
    }

    bestSubset = 0;
    wins = 0;
    bestLeft = thisLeft;
    bestRight = 0.0;
    bestScore = 0;

    TRandomGenerator rgen(gen->numberOfExamples());

    // if a bit in gray is 0, the corresponding value is on the left
    // (the guard keeps the shift count non-negative when there are no values)
    for(int cnt = (noVal > 1) ? (1 << (noVal-1)) - 1 : 0, gray = 0; cnt; cnt--) {
      int prevgray = gray;
      gray = cnt ^ (cnt >> 1);
      int graydiff = gray ^ prevgray;
      // index of the single value whose side changed
      int diffed;
      for(diffed = 0; !(graydiff & 1); graydiff >>= 1, diffed++);

      if (gray > prevgray) { // something went to the right; subtract all the gains for being different from values on the right
        /* prevgray = gray; */   //  unneeded: they only differ in the bit representing this group
        for(gi = gains + diffed*noVal, ge = gi + noVal; gi != ge; thisScore += prevgray & 1 ? -*gi++ : *gi++, prevgray >>= 1);
        thisLeft -= attrDistr[diffed];
        thisRight += attrDistr[diffed];
      }
      else { // something returned to the left; it now differs from the values on the right and no longer from those on the left
        /* prevgray = gray; */   //  unneeded: they only differ in the bit representing this group
        // mirror image of the update above: add for values on the right,
        // subtract for values on the left
        for(gi = gains + diffed*noVal, ge = gi + noVal; gi != ge; thisScore += prevgray & 1 ? *gi++ : -*gi++, prevgray >>= 1);
        thisLeft += attrDistr[diffed];
        thisRight -= attrDistr[diffed];
      }

      // A strictly better score resets the tie counter to 1 (the assignment is
      // part of the condition); an equal score replaces the current best with a
      // probability decided by rgen.randbool(++wins).
      if (   (thisLeft >= minSubset) && (thisRight >= minSubset)
          && (   ((!wins || (thisScore > bestScore)) && ((wins=1) == 1))
              || ((thisScore == bestScore) && rgen.randbool(++wins)))) {
        bestScore = thisScore;
        bestSubset = gray;
        bestLeft = thisLeft;
        bestRight = thisRight;
      }
    }

    delete[] gains;
    gains = NULL;

    if (!wins || !bestSubset) {
      delete[] attrDistr;
      attrDistr = NULL;
      return NULL;
    }

    ai = attrDistr;
    // Wrapped at once so the list is released if anything below throws.
    TIntList *rightSide = new TIntList();
    PIntList wrightSide = rightSide;
    for(i = noVal; i--; bestSubset = bestSubset >> 1, ai++)
      rightSide->push_back(*ai > 0 ? bestSubset & 1 : -1);

    delete[] attrDistr;
    attrDistr = NULL;

    subsetSizes = new TDiscDistribution(2);
    subsetSizes->addint(0, bestLeft);
    subsetSizes->addint(1, bestRight);

    return wrightSide;
  }
  catch (...) {
    // deleting a null pointer is a no-op, so no checks are needed
    delete[] gains;
    delete[] attrDistr;
    throw;
  }
}
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -