📄 dist_clustering.cpp
字号:
}
// update the column error and the profits
{ for(TDistClusterNode *cn1 = clusters; cn1; cn1 = cn1->nextNode)
if (cn1!=cl1)
insertProfitQueueNode(cl1, cn1, distributionAssessor->mergeProfit(*cn1, *cl1), rgen.randsemilong(), profitQueue);
}
delete cl2;
}
// Stores the clustering component and the completion mode; both are read
// later by operator() when the new feature is built.
//   cfd  - clustering component; operator() falls back to a default when it is unset
//   comp - completion mode passed on to the clusters' feature() call
TFeatureByDistributions::TFeatureByDistributions(PClustersFromDistributions cfd, const int &comp)
  : clustersFromDistributions(cfd), completion(comp)
{
}
// File-level default components used by TFeatureByDistributions::operator():
// defaultEDC always computes the example distributions, and defaultCFD performs
// the clustering when no clustersFromDistributions component has been set.
TExampleDistBySorting defaultEDC;
TClustersFromDistributionsByAssessor defaultCFD;
/* Constructs a new feature from the attributes in boundSet.

   The example distributions are computed with defaultEDC, clustered with the
   configured clustersFromDistributions component (or defaultCFD when none is
   set), and the resulting clusters are turned into a feature.

   egen     - source of examples
   boundSet - attributes passed to the distribution builder
   name     - name given to the constructed feature
   quality  - output; set to the clusters' quality, only when a feature is returned
   weight   - weight identifier passed to the distribution builder

   Returns an empty PVariable when there are no distribution values or when
   the clusters yield no feature. */
PVariable TFeatureByDistributions::operator()(PExampleGenerator egen, TVarList &boundSet, const string &name, float &quality, const int &weight)
{
  PExampleDistVector distributions = defaultEDC(egen, boundSet, weight);
  if (distributions->values.size() == 0)
    return PVariable();

  PExampleClusters clustering = clustersFromDistributions ? clustersFromDistributions->call(distributions)
                                                          : defaultCFD(distributions);

  PVariable newFeature = clustering->feature(float(1e30), completion);
  if (newFeature) {
    quality = clustering->quality;
    newFeature->name = name;
    return newFeature;
  }

  return PVariable();
}
// Default constructor; the base assessor has nothing to initialize here.
TDistributionAssessor::TDistributionAssessor()
{
}
// Base implementation ignores the distribution it is given; assessors that
// need it (e.g. TDistributionAssessor_m) provide their own version.
void TDistributionAssessor::setDistribution(const TDiscDistribution & /*classDist*/)
{
}
// Base implementation ignores the average it is given; assessors that need it
// (e.g. TDistributionAssessor_mf) provide their own version.
void TDistributionAssessor::setAverage(const float & /*avg*/)
{
}
void TDistributionAssessor_m::setDistribution(const TDiscDistribution &classDist)
{ p_by_m = vector<float>();
float N = classDist.abs;
const_ITERATE(TDiscDistribution, ci, classDist)
p_by_m.push_back(*ci/N*m);
}
// Constructs the assessor with the given m, the parameter used by
// setDistribution() and m_error().
TDistributionAssessor_m::TDistributionAssessor_m(const float &am)
  : m(am)
{
}
float TDistributionAssessor_m::m_error(const TDiscDistribution &val) const
// returns m estimate for error, multiplied by number of examples
{
float bestok=-1, thisok;
vector<float>::const_iterator pci(p_by_m.begin());
const_ITERATE(TDiscDistribution, dvi, val)
if ((thisok = (*dvi + *(pci++))) > bestok)
bestok = thisok;
return val.abs * (1 - bestok) / (val.abs + m);
}
float TDistributionAssessor_m::m_error(const TDiscDistribution &val1, const TDiscDistribution &val2) const
// returns m estimate for error, summing the given distributions, multiplied by
// the joint number of examples
{
float bestok=-1, thisok;
float N=val1.abs+val2.abs;
vector<float>::const_iterator pci(p_by_m.begin());
for(TDiscDistribution::const_iterator dvi1(val1.begin()), dve1(val1.end()), dvi2(val2.begin());
dvi1!=dve1; dvi1++, dvi2++)
if ((thisok = (*dvi1+*dvi2+ *(pci++)) / (N + m))>bestok) bestok=thisok;
return N * (1 - bestok);
}
// Quality of a single cluster: the negated m_error of the node's discrete
// distribution.
float TDistributionAssessor_m::distributionQuality(TDistClusterNode &node) const
{
  const TDiscDistribution &dist = CAST_TO_DISCDISTRIBUTION(node.distribution);
  return -m_error(dist);
}
/* Returns the profit of merging clust1 and clust2: the quality of the merged
   distribution (the negated joint m_error) minus the stored qualities of the
   two separate clusters (distributionQuality_N).

   The profit is not divided by the total number of examples so that the function returns the
   profit multiplied by the number of examples in the merged column.

   The separate qualities are subtracted, as in the Relief, Laplace and Kramer
   assessors; adding them (as this function used to) makes the result the
   negated sum of all three errors rather than the change caused by the merge.
   NOTE(review): assumes distributionQuality_N holds the value returned by
   distributionQuality() for the node - confirm against the code that fills it. */
float TDistributionAssessor_m::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{
  return - m_error(CAST_TO_DISCDISTRIBUTION(clust1.distribution), CAST_TO_DISCDISTRIBUTION(clust2.distribution))
         - (clust1.distributionQuality_N + clust2.distributionQuality_N);
}
// Quality of a single cluster: twice the sum of squared class entries minus
// the square of the distribution's abs.
float TDistributionAssessor_Relief::distributionQuality(TDistClusterNode &node) const
{
  const TDiscDistribution &dist = CAST_TO_DISCDISTRIBUTION(node.distribution);

  float squares = 0.0;
  for (TDiscDistribution::const_iterator di(dist.begin()), de(dist.end()); di != de; di++)
    squares += *di * *di;

  return 2*squares - dist.abs*dist.abs;
}
// Profit of merging two clusters: four times the sum of products of
// corresponding class entries (over the classes both distributions have),
// minus twice the product of the two abs values. Algebraically this is the
// quality of the summed distribution minus the two separate qualities as
// computed by distributionQuality().
float TDistributionAssessor_Relief::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{
  const TDiscDistribution &dist1 = CAST_TO_DISCDISTRIBUTION(clust1.distribution),
                          &dist2 = CAST_TO_DISCDISTRIBUTION(clust2.distribution);

  TDiscDistribution::const_iterator v1i(dist1.begin()), v1e(dist1.end());
  TDiscDistribution::const_iterator v2i(dist2.begin()), v2e(dist2.end());

  float profit = 0.0;
  while ((v1i != v1e) && (v2i != v2e)) {
    profit += 4 * *v1i * *v2i;
    v1i++;
    v2i++;
  }

  profit -= 2 * dist1.abs * dist2.abs;
  return profit;
}
// Computes N/(N+m) * (sum2 + m*aprior^2 - (sum + m*aprior)^2 / (N+m)).
//   sum  - sum of values
//   sum2 - sum of squared values
//   N    - number of examples
// Presumably an m-smoothed squared error around the prior average `aprior`,
// scaled by the number of examples - confirm against the method's reference.
float TDistributionAssessor_mf::m_error(const float &sum, const float &sum2, const float &N) const
{
  float smoothedSum = sum + m*aprior;
  float smoothedN = N + m;
  return N/smoothedN * (sum2 + m*aprior*aprior - smoothedSum*smoothedSum/smoothedN);
}
// Constructs the assessor with the given m. The prior average (aprior) is not
// set here; it is assigned by setAverage().
TDistributionAssessor_mf::TDistributionAssessor_mf(const float &am)
  : m(am)
{
}
// Stores the average used as the prior (aprior) in m_error().
void TDistributionAssessor_mf::setAverage(const float &avg)
{
  aprior = avg;
}
// Quality of a single cluster: the negated m_error computed from the node's
// continuous distribution (its sum, sum of squares and abs).
// NOTE(review): the distribution is accessed through AS(TContDistribution)
// without a visible check of the result - presumably callers only pass nodes
// with continuous distributions; confirm.
float TDistributionAssessor_mf::distributionQuality(TDistClusterNode &node) const
{ return -m_error(node.distribution.AS(TContDistribution)->sum,
node.distribution.AS(TContDistribution)->sum2,
node.distribution->abs); }
/* Returns the profit of merging clust1 and clust2: the quality of the merged
   distribution (the negated m_error of the summed sum, sum2 and abs) minus
   the stored qualities of the two separate clusters (distributionQuality_N).

   The profit is not divided by the total number of examples so that the function returns the
   profit multiplied by the number of examples in the merged column.

   The separate qualities are subtracted, as in the Relief, Laplace and Kramer
   assessors; adding them (as this function used to) makes the result the
   negated sum of all three errors rather than the change caused by the merge.
   NOTE(review): assumes distributionQuality_N holds the value returned by
   distributionQuality() for the node - confirm against the code that fills it. */
float TDistributionAssessor_mf::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{
  return - m_error(CAST_TO_CONTDISTRIBUTION(clust1.distribution).sum+CAST_TO_CONTDISTRIBUTION(clust2.distribution).sum,
                   CAST_TO_CONTDISTRIBUTION(clust1.distribution).sum2+CAST_TO_CONTDISTRIBUTION(clust2.distribution).sum2,
                   clust1.distribution->abs+clust2.distribution->abs)
         - (clust1.distributionQuality_N + clust2.distributionQuality_N);
}
// Constructs the assessor around the attribute measure that
// distributionQuality() and mergeProfit() apply to distributions.
TDistributionAssessor_Measure::TDistributionAssessor_Measure(PMeasureAttribute meas)
  : measure(meas)
{
}
// Quality of a single cluster: the value the wrapped measure returns for the
// node's distribution.
float TDistributionAssessor_Measure::distributionQuality(TDistClusterNode &node) const
{
  return measure->operator()(node.distribution);
}
float TDistributionAssessor_Measure::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{ if (clust1.distribution->variable->varType==TValue::INTVAR) {
TDiscDistribution nd (CAST_TO_DISCDISTRIBUTION(clust1.distribution));
nd += clust2.distribution;
return measure->operator()(nd);
}
else
raiseError("merging of continuous attributes not implemented");
return 0.0;
}
// Quality of a single cluster: -N * error, where
// error = 1 - (highestProb() + 1) / (N + Nc), N is the distribution's abs and
// Nc its number of entries. The error is 0 for a distribution with no entries.
float TDistributionAssessor_Laplace::distributionQuality(TDistClusterNode &node) const
{
  const TDiscDistribution &dist = CAST_TO_DISCDISTRIBUTION(node.distribution);
  const float N = dist.abs;
  const float Nc = dist.size();

  float error = 0.0;
  if (dist.size())
    error = 1 - (dist.highestProb()+1)/(N+Nc);

  return - N * error;
}
/* Returns the profit of merging clust1 and clust2: the Laplace quality of the
   summed distribution, -N * (1 - (maxC + 1) / (Nc + N)), minus the stored
   qualities of the two separate clusters. N is the joint abs, Nc the larger
   of the two distribution sizes and maxC the largest joint class count.
   Returns 0 when both distributions have no entries.

   The two distributions may differ in length (Nc already allows for that), so
   the majority count is searched over all classes of the longer one, with
   entries missing from the shorter one counted as zero. Previously the search
   stopped at the end of the shorter distribution, which underestimated maxC
   whenever the majority class lay in the remaining tail. */
float TDistributionAssessor_Laplace::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{
  const TDiscDistribution &dist1 = CAST_TO_DISCDISTRIBUTION(clust1.distribution),
                          &dist2 = CAST_TO_DISCDISTRIBUTION(clust2.distribution);
  if (!dist1.size() && !dist2.size())
    return 0.0;

  TDiscDistribution::const_iterator di1(dist1.begin()), de1(dist1.end());
  TDiscDistribution::const_iterator di2(dist2.begin()), de2(dist2.end());

  float maxC = 0.0;
  while ((di1 != de1) || (di2 != de2)) {
    float joint = 0.0;
    if (di1 != de1)
      joint += *(di1++);
    if (di2 != de2)
      joint += *(di2++);
    if (joint > maxC)
      maxC = joint;
  }

  const float Nc = (dist1.size()>dist2.size()) ? dist1.size() : dist2.size();
  const float N = dist1.abs + dist2.abs;
  const float newError = 1 - (maxC+1)/(Nc+N);
  const float newQuality = -newError;
  return N * newQuality - (clust1.distributionQuality_N + clust2.distributionQuality_N);
}
// Quality of a single cluster for a binary class: -p*n/(p+n), where p and n
// are the two entries of the distribution. Raises an error when the
// distribution has more than two entries and returns 0 when it has fewer
// than two.
float TDistributionAssessor_Kramer::distributionQuality(TDistClusterNode &node) const
{
  const TDiscDistribution &dist = CAST_TO_DISCDISTRIBUTION(node.distribution);
  if (dist.size()>2)
    raiseError("binary class expected");

  if (dist.size() != 2)
    return 0.0;

  return - dist[0]*dist[1]/(dist[0]+dist[1]);
}
/* Returns the profit of merging clust1 and clust2 for a binary class: the
   quality of the summed distribution, -(p1+p2)*(n1+n2)/(p1+p2+n1+n2), minus
   the stored qualities of the two separate clusters. p is the count at
   index 0 and n the count at index 1.

   A distribution may hold fewer than two entries (distributionQuality()
   returns 0 for that case); missing entries count as zero. Previously the
   counts were read with front()/back(), which aliased the same element for a
   one-entry distribution (so n was taken to be equal to p) and was undefined
   for an empty one. When the merged distribution holds no examples its
   quality is taken as 0 instead of dividing by zero.

   Distributions with more than two entries are not handled here;
   distributionQuality() raises "binary class expected" for them. */
float TDistributionAssessor_Kramer::mergeProfit(const TDistClusterNode &clust1, const TDistClusterNode &clust2) const
{
  const TDiscDistribution &dist1 = CAST_TO_DISCDISTRIBUTION(clust1.distribution),
                          &dist2 = CAST_TO_DISCDISTRIBUTION(clust2.distribution);

  const float p1 = (dist1.size() > 0) ? dist1[0] : 0.0f;
  const float n1 = (dist1.size() > 1) ? dist1[1] : 0.0f;
  const float p2 = (dist2.size() > 0) ? dist2[0] : 0.0f;
  const float n2 = (dist2.size() > 1) ? dist2[1] : 0.0f;

  const float total = p1+p2+n1+n2;
  const float mergedQuality = (total > 0) ? -(p1+p2)*(n1+n2) / total : 0.0f;

  return mergedQuality - (clust1.distributionQuality_N + clust2.distributionQuality_N);
}
// Constructs the stop criterion with the minimal profit, given as a
// proportion of the base quality (see operator()).
TStopDistributionClustering_noProfit::TStopDistributionClustering_noProfit(const float &minprof)
  : minProfitProportion(minprof)
{
}
// Returns true (stop) when the profit at the front of the queue is negative
// or smaller than baseQuality * minProfitProportion. The queue must not be
// empty, since its front element is read unconditionally.
bool TStopDistributionClustering_noProfit::operator()(const float &baseQuality, const TDistProfitQueue &pq, const TDistClusterNode *) const
{
  if (pq.front()->profit < 0)
    return true;

  return pq.front()->profit < baseQuality*minProfitProportion;
}
/* Decides whether to stop from the profits in the queue.

   With more than one element: the profits in the first half of the queue are
   summed, those in the second half are subtracted (the middle element is
   skipped when the count is odd), and the difference is divided by the number
   of elements taken into account. Returns true when the profit at the front
   of the queue is below the negated result.

   With at most one element: returns true when the front profit is negative.
   The front element is read unconditionally, so the queue must not be empty. */
bool TStopDistributionClustering_noBigChange::operator()(const float &, const TDistProfitQueue &profitQueue, const TDistClusterNode *) const
{
  const int count = profitQueue.size();

  if (count <= 1)
    return profitQueue.front()->profit < 0;

  const int half = count/2;
  float diff = 0.0;
  int i;

  for (i = 0; i < half; i++)
    diff += profitQueue[i]->profit;

  // count - half skips the middle element when count is odd
  for (i = count - half; i < count; i++)
    diff -= profitQueue[i]->profit;

  diff /= (count - count%2);

  return profitQueue.front()->profit < -diff;
}
// Returns true (stop) when the cluster list has fewer than three nodes, i.e.
// when no third node can be reached from `clusters` through nextNode.
bool TStopDistributionClustering_binary::operator()(const float &, const TDistProfitQueue &, const TDistClusterNode *clusters) const
{
  const bool hasThirdNode = clusters && clusters->nextNode && clusters->nextNode->nextNode;
  return !hasThirdNode;
}
// Constructs the stop criterion with the number of nodes, n, that operator()
// compares the cluster list against.
TStopDistributionClustering_n::TStopDistributionClustering_n(const int &an)
  : n(an)
{
}
// Walks up to n nextNode links from the head of the cluster list and returns
// true (stop) when the end of the list is reached on the way, i.e. when the
// list has no node left after n steps.
bool TStopDistributionClustering_n::operator()(const float &, const TDistProfitQueue &, const TDistClusterNode *clusters) const
{
  const TDistClusterNode *node = clusters;
  int remaining = n;

  while (node && remaining) {
    remaining--;
    node = node->nextNode;
  }

  return !node;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -