📄 estimateprob.cpp

📁 orange源码数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
      *result += y1;
      *result *= 0.5;
      return wresult;
    }

    // The normal formula for this is in the function above
    *result -= y1;
    *result *= (x-x1)/(x2-x1);
    *result += y1;
    return wresult;
  }

  raiseError("invalid attribute value for condition");
  return PDistribution();
}


PContingency TConditionalProbabilityEstimator_FromDistribution::operator()() const
{ return CLONE(TContingency, probabilities); }

  

void TConditionalProbabilityEstimator_ByRows::checkCondition(const TValue &condition) const
{ checkProperty(estimatorList);
  if (!estimatorList->size())
    raiseError("empty 'estimatorList'");
  if (condition.isSpecial())
    raiseError("undefined attribute value for condition");
  if (condition.varType != TValue::INTVAR)
    raiseError("value for condition is not discrete");
  if (condition.intV >= estimatorList->size())
    raiseError("value for condition out of range");
}


float TConditionalProbabilityEstimator_ByRows::operator()(const TValue &val, const TValue &condition) const
{ checkCondition(condition);
  return estimatorList->operator[](condition.intV)->call(val);
}


PDistribution TConditionalProbabilityEstimator_ByRows::operator()(const TValue &condition) const
{ checkCondition(condition);
  return estimatorList->operator[](condition.intV)->call();
}


TConditionalProbabilityEstimatorConstructor_ByRows::TConditionalProbabilityEstimatorConstructor_ByRows(PProbabilityEstimatorConstructor pec)
: estimatorConstructor(pec)
{}

PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_ByRows::operator()(PContingency frequencies, PDistribution apriori, PExampleGenerator gen, const long &weightID, const int &attrNo) const
{ if (!frequencies)
    frequencies = mlnew TContingencyAttrClass(gen, weightID, attrNo);
  if (frequencies->varType != TValue::INTVAR)
    if (frequencies->outerVariable)
      raiseError("attribute '%s' is not discrete", frequencies->outerVariable->name.c_str());
    else
      raiseError("discrete attribute for condition expected");

  /* We first try to construct a list of Distributions; if we suceed, we'll return an instance of
     TConditionProbabilityEstimator_FromDistribution. If we fail, we'll construct an instance of
     TConditionProbabilityEstimator_ByRows. */

  // This list stores conditional estimators for the case we fail
  PProbabilityEstimatorList cpel = mlnew TProbabilityEstimatorList();

  PContingency newcont = mlnew TContingencyAttrClass(frequencies->outerVariable, frequencies->innerVariable);
  TDistributionVector::const_iterator fi(frequencies->discrete->begin()), fe(frequencies->discrete->end());
  for (int i = 0; fi!=fe; fi++, i++) {
    PProbabilityEstimator est = estimatorConstructor->call(*fi, apriori, PExampleGenerator(), 0, attrNo);
    cpel->push_back(est);
    PDistribution dist = est->call();
    if (!dist)
      break;
    if (i >= newcont->discrete->size())
      newcont->discrete->push_back(dist);
    else
      newcont->discrete->operator[](i) = dist;
  }
  

  if (fi==fe)
    return mlnew TConditionalProbabilityEstimator_FromDistribution(newcont);

  /* We failed at constructing a matrix of probabilites. We'll just complete the list of estimators. */
  for (; fi!=fe; fi++)
    cpel->push_back(estimatorConstructor->call(*fi, apriori, gen, weightID));

  TConditionalProbabilityEstimator_ByRows *cbr = mlnew TConditionalProbabilityEstimator_ByRows();
  PConditionalProbabilityEstimator wcbr = cbr;
  cbr->estimatorList = cpel;
  return wcbr;
}



TConditionalProbabilityEstimatorConstructor_loess::TConditionalProbabilityEstimatorConstructor_loess(const float &windowProp, const int &ak)
: windowProportion(windowProp),
  nPoints(ak),
  distributionMethod(DISTRIBUTE_MINIMAL)
{}


PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_loess::operator()(PContingency frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ if (frequencies->varType != TValue::FLOATVAR)
    if (frequencies->outerVariable)
      raiseError("attribute '%s' is not continuous", frequencies->outerVariable->name.c_str());
    else
      raiseError("continuous attribute expected for condition");
  if (!frequencies->continuous->size())
    raiseError("empty distribution");

  PContingency cont = CLONE(TContingency, frequencies);

  const TDistributionMap &points = *frequencies->continuous;
  vector<float> xpoints;
  distributePoints(points, nPoints, xpoints, distributionMethod);

  if (!points.size())
    raiseError("no points for the curve (check 'nPoints')");

  TDistributionMap::const_iterator lowedge = points.begin();
  TDistributionMap::const_iterator highedge = points.end();

  bool needAll;
  map<float, PDistribution>::const_iterator from, to;

  vector<float>::const_iterator pi(xpoints.begin()), pe(xpoints.end());
  float refx = *pi;

  from = lowedge;
  to = highedge; 
  int totalNumOfPoints = frequencies->outerDistribution->abs;

  int needpoints = int(ceil(totalNumOfPoints * windowProportion));
  if (needpoints<3)
    needpoints = 3;


  TSimpleRandomGenerator rgen(frequencies->outerDistribution->cases);

  if ((needpoints<=0) || (needpoints>=totalNumOfPoints)) {  //points.size()
    needAll = true;
    from = lowedge;
    to = highedge;
  }
  else {
    needAll = false;

    /* Find the window */
    from = points.lower_bound(refx);
    to = points.upper_bound(refx);
    if (from==to)
      if (to != highedge)
        to++;
      else
        from --;

    /* Extend the interval; we set from to highedge when it would go beyond lowedge, to indicate that only to can be modified now */
    while (needpoints) {
      if ((to == highedge) || ((from != highedge) && (refx - (*from).first < (*to).first - refx))) {
        if (from == lowedge)
          from = highedge;
        else {
          from--;
          needpoints -= (*from).second->cases;
        }
      }
      else {
        to++;
        if (to!=highedge)
          needpoints -= (*to).second->cases;
        else
          needpoints = 0;
      }

    }
    
    if (from == highedge)
      from = lowedge;
    else
      from++;
  }

  int numOfOverflowing = 0;
  // This follows http://www-2.cs.cmu.edu/afs/cs/project/jair/pub/volume4/cohn96a-html/node7.html
  for(;;) {
    TDistributionMap::const_iterator tt = to;
    --tt;
  
    float h = (refx - (*from).first);
    if ((*tt).first - refx  >  h)
      h = ((*tt).first - refx);

    /* Iterate through the window */

    tt = from;
    const float &x = (*tt).first;
    const PDistribution &y = (*tt).second;
    float cases = y->abs;

    float w = fabs(refx - x) / h;
    w = 1 - w*w*w;
    w = w*w*w;

    const float num = y->abs; // number of instances with this x - value
    float n = w * num;
    float Sww = w * w * num;

    float Sx = w * x * num;
    float Swwx  = w * w * x * num;
    float Swwxx = w * w * x * x * num;
    TDistribution *Sy = CLONE(TDistribution, y);
    PDistribution wSy = Sy;
    *Sy *= w;

    float Sxx = w * x * x * num;
    TDistribution *Syy = CLONE(TDistribution, y);
    PDistribution wSyy = Syy;
    *Syy *= w;

    TDistribution *Sxy = CLONE(TDistribution, y);
    PDistribution wSxy = Sxy;
    *Sxy *= w * x;

    if (tt!=to)
      while (++tt != to) {
        const float &x = (*tt).first;
        const PDistribution &y = (*tt).second;
        cases += y->abs;

        w = fabs(refx - x) / h;
        w = 1 - w*w*w;
        w = w*w*w;

        const float num = y->abs;
        n   += w * num;
        Sww += w * w * num;
        Sx  += w * x * num;
        Swwx += w * w * x * num;
        Swwxx += w * w * x * x * num;
        Sxx += w * x * x * num;

        TDistribution *ty = CLONE(TDistribution, y);
        PDistribution wty = ty;
        *ty *= w;
        *Sy  += wty;
        *Syy += wty;
        *ty *= x;
        *Sxy += wty;

        //*ty *= PDistribution(y);
      }

    float sigma_x2 = n<1e-6 ? 0.0 : (Sxx - Sx * Sx / n)/n;
    if (sigma_x2<1e-10) {
      *Sy *= 0;
      Sy->cases = cases;
      (*cont->continuous)[refx] = (wSy);
    }

    TDistribution *sigma_y2 = CLONE(TDistribution, Sy);
    PDistribution wsigma_y2 = sigma_y2;
    *sigma_y2 *= wsigma_y2;
    *sigma_y2 *= -1/n;
    *sigma_y2 += wSyy;
    *sigma_y2 *= 1/n;

    TDistribution *sigma_xy = CLONE(TDistribution, Sy);
    PDistribution wsigma_xy = sigma_xy;
    *sigma_xy *= -Sx/n;
    *sigma_xy += wSxy; 
    *sigma_xy *= 1/n;

    // This will be sigma_xy / sigma_x2, but we'll multiply it by whatever we need
    TDistribution *sigma_tmp = CLONE(TDistribution, sigma_xy);
    PDistribution wsigma_tmp = sigma_tmp;
    //*sigma_tmp *= wsigma_tmp;
    *sigma_tmp *= 1/sigma_x2;

    const float difx = refx - Sx/n;

    // computation of y
    *sigma_tmp *= difx;
    *Sy *= 1/n;
    *Sy += *sigma_tmp;

    // probabilities that are higher than 0.9 normalize with a logistic function, which produces two positive 
    // effects: prevents overfitting and avoids probabilities that are higher than 1.0. But, on the other hand, this 
    // solution is rather unmathematical. Do the same for probabilities that are lower than 0.1.

    vector<float>::iterator syi(((TDiscDistribution *)(Sy))->distribution.begin()); 
    vector<float>::iterator sye(((TDiscDistribution *)(Sy))->distribution.end()); 
    for (; syi!=sye; syi++) {
      if (*syi > 0.9) {
        Sy->abs -= *syi;
        *syi = 1/(1+exp(-10*((*syi)-0.9)*log(9.0)-log(9.0)));
        Sy->abs += *syi;
      }
      if (*syi < 0.1) {
        Sy->abs -= *syi;
        *syi = 1/(1+exp(10*(0.1-(*syi))*log(9.0)+log(9.0)));
        Sy->abs += *syi;
      }
    }

    Sy->cases = cases;
    Sy->normalize();
    (*cont->continuous)[refx] = (wSy);
 
    // now for the variance
    // restore sigma_tmp and compute the conditional sigma
    *sigma_tmp *= (1/difx);
    *sigma_tmp *= wsigma_xy;
    *sigma_tmp *= -1; 
    *sigma_tmp += wsigma_y2;
    // fct corresponds to part of (10) in the brackets (see URL above)
 //   float fct = Sww + difx*difx/sigma_x2/sigma_x2 * (Swwxx   - 2/n * Sx*Swwx   +  2/n/n * Sx*Sx*Sww);
    float fct = 1 + difx*difx/sigma_x2; //n + difx*difx/sigma_x2+n*n --- add this product to the overall fct sum if you are estimating error for a single user and not for the line.  
    *sigma_tmp *= fct/n; // fct/n/n;
    ((TDiscDistribution *)(Sy)) ->variances = mlnew TFloatList(((TDiscDistribution *)(sigma_tmp))->distribution);
    

    // on to the next point
    pi++;
    if (pi==pe)
      break; 

    refx = *pi;

    // Adjust the window
    while (to!=highedge) {
      float dif = (refx - (*from).first) - ((*to).first - refx);
      if ((dif>0) || (dif==0) && rgen.randbool()) {
        if (numOfOverflowing > 0) {
          from++;
          numOfOverflowing -= (*from).second->cases;
        }
        else {
          to++;
          if (to!=highedge) 
            numOfOverflowing += (*to).second->cases;
        }
      }
	    else
		    break;
    }
  }

  return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -