📄 estimateprob.cpp
字号:
*result += y1;
*result *= 0.5;
return wresult;
}
// The normal formula for this is in the function above
*result -= y1;
*result *= (x-x1)/(x2-x1);
*result += y1;
return wresult;
}
raiseError("invalid attribute value for condition");
return PDistribution();
}
PContingency TConditionalProbabilityEstimator_FromDistribution::operator()() const
{ return CLONE(TContingency, probabilities); }
void TConditionalProbabilityEstimator_ByRows::checkCondition(const TValue &condition) const
{ checkProperty(estimatorList);
if (!estimatorList->size())
raiseError("empty 'estimatorList'");
if (condition.isSpecial())
raiseError("undefined attribute value for condition");
if (condition.varType != TValue::INTVAR)
raiseError("value for condition is not discrete");
if (condition.intV >= estimatorList->size())
raiseError("value for condition out of range");
}
float TConditionalProbabilityEstimator_ByRows::operator()(const TValue &val, const TValue &condition) const
{ checkCondition(condition);
return estimatorList->operator[](condition.intV)->call(val);
}
PDistribution TConditionalProbabilityEstimator_ByRows::operator()(const TValue &condition) const
{ checkCondition(condition);
return estimatorList->operator[](condition.intV)->call();
}
TConditionalProbabilityEstimatorConstructor_ByRows::TConditionalProbabilityEstimatorConstructor_ByRows(PProbabilityEstimatorConstructor pec)
: estimatorConstructor(pec)
{}
PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_ByRows::operator()(PContingency frequencies, PDistribution apriori, PExampleGenerator gen, const long &weightID, const int &attrNo) const
{ if (!frequencies)
frequencies = mlnew TContingencyAttrClass(gen, weightID, attrNo);
if (frequencies->varType != TValue::INTVAR)
if (frequencies->outerVariable)
raiseError("attribute '%s' is not discrete", frequencies->outerVariable->name.c_str());
else
raiseError("discrete attribute for condition expected");
/* We first try to construct a list of Distributions; if we suceed, we'll return an instance of
TConditionProbabilityEstimator_FromDistribution. If we fail, we'll construct an instance of
TConditionProbabilityEstimator_ByRows. */
// This list stores conditional estimators for the case we fail
PProbabilityEstimatorList cpel = mlnew TProbabilityEstimatorList();
PContingency newcont = mlnew TContingencyAttrClass(frequencies->outerVariable, frequencies->innerVariable);
TDistributionVector::const_iterator fi(frequencies->discrete->begin()), fe(frequencies->discrete->end());
for (int i = 0; fi!=fe; fi++, i++) {
PProbabilityEstimator est = estimatorConstructor->call(*fi, apriori, PExampleGenerator(), 0, attrNo);
cpel->push_back(est);
PDistribution dist = est->call();
if (!dist)
break;
if (i >= newcont->discrete->size())
newcont->discrete->push_back(dist);
else
newcont->discrete->operator[](i) = dist;
}
if (fi==fe)
return mlnew TConditionalProbabilityEstimator_FromDistribution(newcont);
/* We failed at constructing a matrix of probabilites. We'll just complete the list of estimators. */
for (; fi!=fe; fi++)
cpel->push_back(estimatorConstructor->call(*fi, apriori, gen, weightID));
TConditionalProbabilityEstimator_ByRows *cbr = mlnew TConditionalProbabilityEstimator_ByRows();
PConditionalProbabilityEstimator wcbr = cbr;
cbr->estimatorList = cpel;
return wcbr;
}
TConditionalProbabilityEstimatorConstructor_loess::TConditionalProbabilityEstimatorConstructor_loess(const float &windowProp, const int &ak)
: windowProportion(windowProp),
nPoints(ak),
distributionMethod(DISTRIBUTE_MINIMAL)
{}
PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_loess::operator()(PContingency frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ if (frequencies->varType != TValue::FLOATVAR)
if (frequencies->outerVariable)
raiseError("attribute '%s' is not continuous", frequencies->outerVariable->name.c_str());
else
raiseError("continuous attribute expected for condition");
if (!frequencies->continuous->size())
raiseError("empty distribution");
PContingency cont = CLONE(TContingency, frequencies);
const TDistributionMap &points = *frequencies->continuous;
vector<float> xpoints;
distributePoints(points, nPoints, xpoints, distributionMethod);
if (!points.size())
raiseError("no points for the curve (check 'nPoints')");
TDistributionMap::const_iterator lowedge = points.begin();
TDistributionMap::const_iterator highedge = points.end();
bool needAll;
map<float, PDistribution>::const_iterator from, to;
vector<float>::const_iterator pi(xpoints.begin()), pe(xpoints.end());
float refx = *pi;
from = lowedge;
to = highedge;
int totalNumOfPoints = frequencies->outerDistribution->abs;
int needpoints = int(ceil(totalNumOfPoints * windowProportion));
if (needpoints<3)
needpoints = 3;
TSimpleRandomGenerator rgen(frequencies->outerDistribution->cases);
if ((needpoints<=0) || (needpoints>=totalNumOfPoints)) { //points.size()
needAll = true;
from = lowedge;
to = highedge;
}
else {
needAll = false;
/* Find the window */
from = points.lower_bound(refx);
to = points.upper_bound(refx);
if (from==to)
if (to != highedge)
to++;
else
from --;
/* Extend the interval; we set from to highedge when it would go beyond lowedge, to indicate that only to can be modified now */
while (needpoints) {
if ((to == highedge) || ((from != highedge) && (refx - (*from).first < (*to).first - refx))) {
if (from == lowedge)
from = highedge;
else {
from--;
needpoints -= (*from).second->cases;
}
}
else {
to++;
if (to!=highedge)
needpoints -= (*to).second->cases;
else
needpoints = 0;
}
}
if (from == highedge)
from = lowedge;
else
from++;
}
int numOfOverflowing = 0;
// This follows http://www-2.cs.cmu.edu/afs/cs/project/jair/pub/volume4/cohn96a-html/node7.html
for(;;) {
TDistributionMap::const_iterator tt = to;
--tt;
float h = (refx - (*from).first);
if ((*tt).first - refx > h)
h = ((*tt).first - refx);
/* Iterate through the window */
tt = from;
const float &x = (*tt).first;
const PDistribution &y = (*tt).second;
float cases = y->abs;
float w = fabs(refx - x) / h;
w = 1 - w*w*w;
w = w*w*w;
const float num = y->abs; // number of instances with this x - value
float n = w * num;
float Sww = w * w * num;
float Sx = w * x * num;
float Swwx = w * w * x * num;
float Swwxx = w * w * x * x * num;
TDistribution *Sy = CLONE(TDistribution, y);
PDistribution wSy = Sy;
*Sy *= w;
float Sxx = w * x * x * num;
TDistribution *Syy = CLONE(TDistribution, y);
PDistribution wSyy = Syy;
*Syy *= w;
TDistribution *Sxy = CLONE(TDistribution, y);
PDistribution wSxy = Sxy;
*Sxy *= w * x;
if (tt!=to)
while (++tt != to) {
const float &x = (*tt).first;
const PDistribution &y = (*tt).second;
cases += y->abs;
w = fabs(refx - x) / h;
w = 1 - w*w*w;
w = w*w*w;
const float num = y->abs;
n += w * num;
Sww += w * w * num;
Sx += w * x * num;
Swwx += w * w * x * num;
Swwxx += w * w * x * x * num;
Sxx += w * x * x * num;
TDistribution *ty = CLONE(TDistribution, y);
PDistribution wty = ty;
*ty *= w;
*Sy += wty;
*Syy += wty;
*ty *= x;
*Sxy += wty;
//*ty *= PDistribution(y);
}
float sigma_x2 = n<1e-6 ? 0.0 : (Sxx - Sx * Sx / n)/n;
if (sigma_x2<1e-10) {
*Sy *= 0;
Sy->cases = cases;
(*cont->continuous)[refx] = (wSy);
}
TDistribution *sigma_y2 = CLONE(TDistribution, Sy);
PDistribution wsigma_y2 = sigma_y2;
*sigma_y2 *= wsigma_y2;
*sigma_y2 *= -1/n;
*sigma_y2 += wSyy;
*sigma_y2 *= 1/n;
TDistribution *sigma_xy = CLONE(TDistribution, Sy);
PDistribution wsigma_xy = sigma_xy;
*sigma_xy *= -Sx/n;
*sigma_xy += wSxy;
*sigma_xy *= 1/n;
// This will be sigma_xy / sigma_x2, but we'll multiply it by whatever we need
TDistribution *sigma_tmp = CLONE(TDistribution, sigma_xy);
PDistribution wsigma_tmp = sigma_tmp;
//*sigma_tmp *= wsigma_tmp;
*sigma_tmp *= 1/sigma_x2;
const float difx = refx - Sx/n;
// computation of y
*sigma_tmp *= difx;
*Sy *= 1/n;
*Sy += *sigma_tmp;
// probabilities that are higher than 0.9 normalize with a logistic function, which produces two positive
// effects: prevents overfitting and avoids probabilities that are higher than 1.0. But, on the other hand, this
// solution is rather unmathematical. Do the same for probabilities that are lower than 0.1.
vector<float>::iterator syi(((TDiscDistribution *)(Sy))->distribution.begin());
vector<float>::iterator sye(((TDiscDistribution *)(Sy))->distribution.end());
for (; syi!=sye; syi++) {
if (*syi > 0.9) {
Sy->abs -= *syi;
*syi = 1/(1+exp(-10*((*syi)-0.9)*log(9.0)-log(9.0)));
Sy->abs += *syi;
}
if (*syi < 0.1) {
Sy->abs -= *syi;
*syi = 1/(1+exp(10*(0.1-(*syi))*log(9.0)+log(9.0)));
Sy->abs += *syi;
}
}
Sy->cases = cases;
Sy->normalize();
(*cont->continuous)[refx] = (wSy);
// now for the variance
// restore sigma_tmp and compute the conditional sigma
*sigma_tmp *= (1/difx);
*sigma_tmp *= wsigma_xy;
*sigma_tmp *= -1;
*sigma_tmp += wsigma_y2;
// fct corresponds to part of (10) in the brackets (see URL above)
// float fct = Sww + difx*difx/sigma_x2/sigma_x2 * (Swwxx - 2/n * Sx*Swwx + 2/n/n * Sx*Sx*Sww);
float fct = 1 + difx*difx/sigma_x2; //n + difx*difx/sigma_x2+n*n --- add this product to the overall fct sum if you are estimating error for a single user and not for the line.
*sigma_tmp *= fct/n; // fct/n/n;
((TDiscDistribution *)(Sy)) ->variances = mlnew TFloatList(((TDiscDistribution *)(sigma_tmp))->distribution);
// on to the next point
pi++;
if (pi==pe)
break;
refx = *pi;
// Adjust the window
while (to!=highedge) {
float dif = (refx - (*from).first) - ((*to).first - refx);
if ((dif>0) || (dif==0) && rgen.randbool()) {
if (numOfOverflowing > 0) {
from++;
numOfOverflowing -= (*from).second->cases;
}
else {
to++;
if (to!=highedge)
numOfOverflowing += (*to).second->cases;
}
}
else
break;
}
}
return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -