⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 estimateprob.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "vars.hpp"
#include "contingency.hpp"
#include "examplegen.hpp"

#include "estimateprob.ppp"
#include "stat.hpp"
#include "random.hpp"


/* Class descriptions for the list types "TProbabilityEstimatorList" and
   "TConditionalProbabilityEstimatorList", whose elements are PProbabilityEstimator and
   PConditionalProbabilityEstimator respectively.
   NOTE(review): the meaning of the 'true' and ORANGE_API arguments is fixed by the macro,
   which is defined outside this file -- confirm there. */
DEFINE_TOrangeVector_classDescription(PProbabilityEstimator, "TProbabilityEstimatorList", true, ORANGE_API)
DEFINE_TOrangeVector_classDescription(PConditionalProbabilityEstimator, "TConditionalProbabilityEstimatorList", true, ORANGE_API)

/* Records which kinds of values the estimator declares support for:
   'disc' is stored in supportsDiscrete, 'cont' in supportsContinuous. */
TProbabilityEstimator::TProbabilityEstimator(const bool &disc, const bool &cont)
  : supportsDiscrete(disc), supportsContinuous(cont)
{
}


/* Base implementation: there is no distribution to hand out, so an empty
   (default-constructed) PDistribution is returned. */
PDistribution TProbabilityEstimator::operator()() const
{
  PDistribution none;
  return none;
}



/* Records which kinds of values the conditional estimator declares support for:
   'disc' is stored in supportsDiscrete, 'cont' in supportsContinuous. */
TConditionalProbabilityEstimator::TConditionalProbabilityEstimator(const bool &disc, const bool &cont)
  : supportsDiscrete(disc), supportsContinuous(cont)
{
}


/* Base implementation: there is no contingency to hand out, so an empty
   (default-constructed) PContingency is returned. */
PContingency TConditionalProbabilityEstimator::operator()() const
{
  PContingency none;
  return none;
}



/* Wraps a ready-made distribution 'af' as a probability estimator.

   Both value kinds are declared as supported at first. When a distribution is
   given, the declaration is narrowed to match its type: a discrete distribution
   switches off continuous support, a continuous one switches off discrete
   support. Without a distribution nothing can be checked, so everything stays
   promised and it is up to the user to supply a suitable distribution later. */
TProbabilityEstimator_FromDistribution::TProbabilityEstimator_FromDistribution(PDistribution af)
  : TProbabilityEstimator(true, true), probabilities(af)
{
  if (!probabilities)
    return;

  if (probabilities.is_derived_from(TDiscDistribution)) {
    supportsContinuous = false;
    return;
  }

  if (probabilities.is_derived_from(TContDistribution))
    supportsDiscrete = false;
}


/* Returns the probability of 'classVal' according to the stored distribution.

   Raises an error when 'probabilities' fails checkProperty or when 'classVal'
   is a special (undefined) value. */
float TProbabilityEstimator_FromDistribution::operator()(const TValue &classVal) const
{
  checkProperty(probabilities);

  if (classVal.isSpecial())
    raiseError("undefined attribute value");

  /* Fast path for the most usual case: an integer value looked up directly in
     a discrete distribution. */
  if (classVal.varType == TValue::INTVAR) {
    const TDiscDistribution *discrete = probabilities.AS(TDiscDistribution);
    if (discrete)
      return (*discrete)[classVal.intV];
    /* Not a discrete distribution: fall through and let p() handle it
       (more probably, report an error). */
  }

  return probabilities->p(classVal);
}


/* Returns a clone of the stored distribution, so the caller does not get the
   estimator's own object. */
PDistribution TProbabilityEstimator_FromDistribution::operator()() const
{
  PDistribution copy = CLONE(TDistribution, probabilities);
  return copy;
}



/* Relative-frequency estimator: clones 'frequencies', wraps the clone in a
   TProbabilityEstimator_FromDistribution and normalizes it. The caller's
   distribution is not modified; the remaining arguments are unused. */
PProbabilityEstimator TProbabilityEstimatorConstructor_relative::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{
  TProbabilityEstimator_FromDistribution *fromDist =
    mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));

  /* Wrap first so the new object is owned before anything else is done with it. */
  PProbabilityEstimator wrapped = fromDist;

  fromDist->probabilities->normalize();
  return wrapped;
}



/* Stores the Laplace parameter 'al' in 'l' and the renormalization flag 'an'
   in 'renormalize'. */
TProbabilityEstimatorConstructor_Laplace::TProbabilityEstimatorConstructor_Laplace(const float &al, const bool &an)
  : l(al), renormalize(an)
{
}


/* Laplace estimator built from a clone of 'frequencies' (the caller's
   distribution is left untouched; the remaining arguments are unused).

   For a discrete distribution every element f is replaced by
       (f' + l) / (cases + l * noOfElements())
   where f' is f itself or, when 'renormalize' is set and abs differs from
   cases (and abs is not ~0), f scaled by cases/abs.

   Non-discrete distributions, and discrete ones for which the divisor is 0,
   are simply normalized. */
PProbabilityEstimator TProbabilityEstimatorConstructor_Laplace::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (!ddist) {
    pefd->probabilities->normalize();
    return estimator;
  }

  /* Snapshot the totals BY VALUE. The loop below rewrites the very same
     distribution through setint(); binding references to ddist->abs and
     ddist->cases would make the formula observe any bookkeeping updates
     setint() performs on them while the loop is still running.
     NOTE(review): setint() is defined elsewhere -- presumably it adjusts
     abs/cases; the copies are correct either way. */
  const float abs = ddist->abs;
  const float cases = ddist->cases;
  const float div = cases + l * ddist->noOfElements();

  if (!div) {
    pefd->probabilities->normalize();
    return estimator;
  }

  /* Rescale frequencies to the number of cases only when asked to, when the
     two totals actually differ, and when abs is large enough to divide by. */
  const bool rescale = renormalize && (cases != abs) && !(abs<1e-20);

  int i = 0;
  PITERATE(TDiscDistribution, di, ddist) {
    const float freq = rescale ? *di / abs * cases : *di;
    ddist->setint(i++, (freq + l) / div);
  }

  return estimator;
}



/* Stores the m parameter 'am' in 'm' and the renormalization flag 'an' in
   'renormalize'. */
TProbabilityEstimatorConstructor_m::TProbabilityEstimatorConstructor_m(const float &am, const bool &an)
  : m(am), renormalize(an)
{
}


/* m-estimate built from a clone of 'frequencies' and the 'apriori'
   distribution (the caller's distributions are left untouched).

   When the clone is discrete, has cases > 1e-20 and 'apriori' is given, every
   element f with prior frequency a is replaced by
       (f' + a * m / apriori.abs) / (cases + m)
   where f' is f itself or, when 'renormalize' is set and abs differs from
   cases (and abs is not ~0), f scaled by cases/abs.
   In all other situations the clone is simply normalized.

   Raises "invalid apriori distribution" when 'apriori' is not discrete or its
   abs is below 1e-20. */
PProbabilityEstimator TProbabilityEstimatorConstructor_m::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (!ddist || !(ddist->cases > 1e-20) || !apriori) {
    pefd->probabilities->normalize();
    return estimator;
  }

  TDiscDistribution *dapriori = apriori.AS(TDiscDistribution);
  if (!dapriori || (dapriori->abs < 1e-20))
    raiseError("invalid apriori distribution");

  const float mabs = m/dapriori->abs;

  /* Snapshot the totals BY VALUE. The loop below rewrites the very same
     distribution through setint(); binding references to ddist->abs and
     ddist->cases would make the formula observe any bookkeeping updates
     setint() performs on them while the loop is still running.
     NOTE(review): setint() is defined elsewhere -- presumably it adjusts
     abs/cases; the copies are correct either way. */
  const float abs = ddist->abs;
  const float cases = ddist->cases;
  const float div = cases + m;

  /* Same guard as the Laplace estimator: never divide by a (near) zero abs. */
  const bool rescale = renormalize && (abs != cases) && !(abs<1e-20);

  TDiscDistribution::iterator ai(dapriori->begin()), ae(dapriori->end());
  int i = 0;
  for(TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()); di != de; ++di, ++i) {
    /* An apriori distribution shorter than the frequencies has no entry for
       the trailing values; treat their prior as 0 instead of reading past
       the end of the apriori distribution. */
    float prior = 0.0;
    if (ai != ae) {
      prior = *ai;
      ++ai;
    }

    const float freq = rescale ? *di / abs * cases : *di;
    ddist->setint(i, (freq + prior*mabs) / div);
  }

  return estimator;
}



/* Stores the kernel estimator settings: 'minImp' in 'minImpact', 'smoo' in
   'smoothing' and 'nP' in 'nPoints'. */
TProbabilityEstimatorConstructor_kernel::TProbabilityEstimatorConstructor_kernel(const float &minImp, const float &smoo, const int &nP)
  : minImpact(minImp), smoothing(smoo), nPoints(nP)
{
}


/* Gaussian kernel estimator for a continuous distribution.

   A set of points is chosen with distributePoints(); for each point x the
   curve value is computed from the examples in 'frequencies' weighted by
   exp(-0.5 * ((x - value)/h)^2), where the bandwidth is
       h = smoothing * sqrt(error) * abs^(-1/5).
   When minImpact > 0, only examples within distance t of x are summed, where
   t is the distance at which the Gaussian density 1/(h*sqrt(2*pi)) * exp(...)
   drops to minImpact; points with no example in that window get 0.

   The resulting curve is returned wrapped in a
   TProbabilityEstimator_FromDistribution. 'apriori' and 'weightID' are unused.

   Raises an error if 'frequencies' is not a continuous distribution, if it is
   empty, or if minImpact lies outside [0.0, 1.0]. */
PProbabilityEstimator TProbabilityEstimatorConstructor_kernel::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist)
    raiseError("continuous distribution expected");
  if (!cdist->size())
    raiseError("empty distribution");
  if ((minImpact<0.0) || (minImpact>1.0))
    raiseError("'minImpact' should be between 0.0 and 1.0 (not %5.3f)", minImpact);

  vector<float> points;
  distributePoints(cdist->distribution, nPoints, points);

  TContDistribution *curve = mlnew TContDistribution(frequencies->variable);
  PDistribution wcurve = curve;

  /* Bandwidth suggested by Chad Shaw. Also found in http://www.stat.lsa.umich.edu/~kshedden/Courses/Stat606/Notes/interpolate.pdf */
  const float h = smoothing * sqrt(cdist->error()) * exp(- 1.0/5.0 * log(cdist->abs)); // 1.144
  const float hsqrt2pi = h * 2.5066282746310002; // 2.5066... == sqrt(2*pi)

  /* Half-width of the summation window; only meaningful (and only read)
     when minImpact > 0. */
  float t = 0.0;

  if (minImpact>0) {
    t = -2 * log(minImpact*hsqrt2pi);
    if (t<=0) {
      /* minImpact too high, but that's user's problem: no example can reach
         the required impact, so the whole curve is zero. The curve is wrapped
         in an estimator exactly as on the normal exit below; a bare
         distribution is not a probability estimator. */
      ITERATE(vector<float>, pi, points)
        curve->setfloat(*pi, 0.0);
      return mlnew TProbabilityEstimator_FromDistribution(wcurve);
    }

    t = h * sqrt(t);
  }


  ITERATE(vector<float>, pi, points) {
    const float &x = *pi;
    TContDistribution::const_iterator from, to;

    if (minImpact>0) {
      /* Restrict the sum to the examples in [x-t, x+t). */
      from = cdist->lower_bound(x-t);
      to = cdist->lower_bound(x+t);
      if ((from==cdist->end()) || (to==cdist->begin()) || (from==to)) {
        curve->setfloat(x, 0.0);
        continue;
      }
    }
    else {
      from = cdist->begin();
      to = cdist->end();
    }

    float p = 0.0, n = 0.0;
    for(; from != to; from++) {
      n += (*from).second;
      p += (*from).second * exp( - 0.5 * sqr( (x - (*from).first)/h ) );
    }

    curve->setfloat(x, p/hsqrt2pi/(n*h)); // hsqrt2pi is from the inside (errf), n*h is for the sum average
  }


  return mlnew TProbabilityEstimator_FromDistribution(wcurve);
}



/* Stores the loess settings: 'windowProp' in 'windowProportion' and 'ak' in
   'nPoints'. The point distribution method always starts as
   DISTRIBUTE_MINIMAL. */
TProbabilityEstimatorConstructor_loess::TProbabilityEstimatorConstructor_loess(const float &windowProp, const int &ak)
  : windowProportion(windowProp), nPoints(ak), distributionMethod(DISTRIBUTE_MINIMAL)
{
}



/* Loess estimator for a continuous distribution: runs loess() over the
   distribution of 'frequencies' with the configured number of points, window
   proportion and distribution method, and returns the resulting curve wrapped
   in a TProbabilityEstimator_FromDistribution. 'weightID' and 'attrNo' are
   unused.

   Raises an error if 'frequencies' is not a continuous distribution (naming
   the attribute when it is known) or if the distribution is empty. */
PProbabilityEstimator TProbabilityEstimatorConstructor_loess::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &weightID, const int &attrNo) const
{ TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist) {
    /* cdist is null in this branch, so the attribute name has to be read
       through 'frequencies' itself. */
    if (frequencies && frequencies->variable)
      raiseError("attribute '%s' is not continuous", frequencies->variable->name.c_str());
    else
      raiseError("continuous distribution expected");
  }
  if (!cdist->size())
    raiseError("empty distribution");

  map<float, float> loesscurve;
  loess(cdist->distribution, nPoints, windowProportion, loesscurve, distributionMethod);
  return mlnew TProbabilityEstimator_FromDistribution(mlnew TContDistribution(loesscurve));
}



/* Wraps a ready-made contingency 'cont' as a conditional probability estimator.

   Both value kinds are declared as supported at first. When a contingency is
   given, the declaration follows its varType: continuous support only for
   FLOATVAR, discrete support only for INTVAR. */
TConditionalProbabilityEstimator_FromDistribution::TConditionalProbabilityEstimator_FromDistribution(PContingency cont)
  : TConditionalProbabilityEstimator(true, true), probabilities(cont)
{
  if (!probabilities)
    return;

  supportsContinuous = (probabilities->varType == TValue::FLOATVAR);
  supportsDiscrete = (probabilities->varType == TValue::INTVAR);
}


/* Returns the probability of 'val' given 'condition'.

   Discrete (INTVAR) condition: the value is looked up directly in the
   contingency.

   Continuous (FLOATVAR) condition x: the contingency's 'continuous' map is
   searched for x.
     - x equal to a stored key: that key's distribution is used as is;
     - x strictly between two stored keys: linear interpolation between the
       two neighbouring distributions;
     - x below the smallest or above the largest key: 0.0.

   Raises an error when a continuous condition or 'val' is special, when the
   contingency is not of FLOATVAR type for a continuous condition, or when the
   condition is neither INTVAR nor FLOATVAR. */
float TConditionalProbabilityEstimator_FromDistribution::operator()(const TValue &val, const TValue &condition) const
{ if (condition.varType == TValue::INTVAR)
    return probabilities->operator[](condition)->operator[](val);

  else if (condition.varType == TValue::FLOATVAR) {
    if (condition.isSpecial() || val.isSpecial())
      raiseError("undefined attribute value for condition");
    if (probabilities->varType != TValue::FLOATVAR)
      raiseError("invalid attribute type for condition");

    const TDistributionMap *dm = probabilities->continuous;
    const float &x = condition.floatV;

    /* lower_bound gives the first key >= x, so an exact hit (including a hit
       on the largest key) is found here. upper_bound would skip past it,
       making the equality test below unreachable and turning a condition
       equal to the largest key into 0.0. */
    TDistributionMap::const_iterator rb = dm->lower_bound(x);
    if (rb==dm->end())
      return 0.0;    // x lies above every stored key
    if ((*rb).first==x)
      return (*rb).second->operator[](val);
    if (rb==dm->begin())
      return 0.0;    // x lies below every stored key

    /* x is strictly between the key before rb and the key at rb. */
    const float x2 = (*rb).first;
    const float y2 = (*rb).second->operator[](val);
    rb--;
    const float x1 = (*rb).first;
    const float y1 = (*rb).second->operator[](val);

    /* Defensive only: guards the division below. */
    if (x1 == x2)
      return (y1+y2)/2;

    return y1 + (x - x1) * (y2 - y1) / (x2 - x1);
  }

  raiseError("invalid attribute type for condition");
  return 0.0;
}


PDistribution TConditionalProbabilityEstimator_FromDistribution::operator()(const TValue &condition) const
{ if (condition.varType == TValue::INTVAR)
    return probabilities->operator[](condition);

  else if (condition.varType == TValue::FLOATVAR) {
    if (condition.isSpecial())
      raiseError("undefined attribute value for condition");
    if (probabilities->varType != TValue::FLOATVAR)
      raiseError("invalid attribute value type for condition");

    const float &x = condition.floatV;
    const TDistributionMap *dm = probabilities->continuous;
    TDistributionMap::const_iterator rb = dm->upper_bound(x);
    if (rb==dm->end())
      rb = dm->begin();
    
    TDistribution *result = CLONE(TDistribution, (*rb).second);
    PDistribution wresult = result;

    if ((rb==dm->begin()) && ((*rb).first!=x)) {
      (*result) *= 0;
      return wresult;
    }

    const float &x2 = (*rb).first;
    rb--;
    const float &x1 = (*rb).first;
    const PDistribution &y1 = (*rb).second;

    if (x1 == x2) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -