📄 dist_clustering.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
#include "stladdon.hpp"
#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "learn.hpp"
#include "classify.hpp"
#include "table.hpp"
#include "measures.hpp"
#include "distance.hpp"
#include "dist_clustering.ppp"
TDistClusterNode::TDistClusterNode(PDistribution adis, const PExample &example, const float &quality, TDistClusterNode *aprevNode)
: nextNode(NULL),
prevNode(aprevNode),
distribution(adis),
mergeProfits(NULL),
cluster(mlnew TExampleCluster(example)),
distributionQuality_N(quality)
{}
TDistClusterNode::~TDistClusterNode()
{ mldelete nextNode; }
TDistProfitNode::TDistProfitNode(TDistClusterNode *c1, TDistClusterNode *c2, const float &prof, const int &qind, const long &roff)
: cluster1(c1),
cluster2(c2),
profit(prof),
queueIndex(qind),
randoff(roff)
{}
TDistProfitNode::~TDistProfitNode()
{ mldelete it1;
mldelete it2;
}
int TDistProfitNode::compare(const TDistProfitNode &other) const
{ if (profit<other.profit)
return -1;
else if (profit>other.profit)
return 1;
else if (randoff<other.randoff)
return -1;
else if (randoff>other.randoff)
return 1;
return 0;
}
T_ExampleDist::T_ExampleDist(PExample anexample, PDistribution adist)
: example(anexample),
distribution(adist)
{}
int TExampleDistVector::traverse(visitproc visit, void *arg) const
{ TRAVERSE(TOrange::traverse);
const_ITERATE(vector<T_ExampleDist>, pi, values) {
PVISIT((*pi).example);
PVISIT((*pi).distribution);
}
return 0;
}
int TExampleDistVector::dropReferences()
{ DROPREFERENCES(TOrange::dropReferences);
values.clear();
return 0;
}
PExampleDistVector TExampleDistBySorting::operator()(PExampleGenerator gen, TVarList &aboundSet, const int &weightID)
{
// Identify bound attributes
vector<int> bound;
{ ITERATE(TVarList, evi, aboundSet) {
if ((*evi)->varType != TValue::INTVAR)
raiseError("attribute '%s' is not discrete", (*evi)->name.c_str());
bound.push_back(gen->domain->getVarNum(*evi));
}
}
PDomain boundDomain = mlnew TDomain(PVariable(), aboundSet);
TExampleTable sorted(gen, false);
sorted.sort(bound);
PExampleDistVector edv = mlnew TExampleDistVector();
TDistribution *insertPoint = NULL;
for(TExampleIterator ebegin(sorted.begin()), eend(sorted.end()), eprev(ebegin); ebegin!=eend; eprev=ebegin, ++ebegin) {
if (insertPoint) { // we have it, but should still check for equality of bound attributes
vector<int>::iterator bi(bound.begin()), be(bound.end());
for( ; (bi!=be) && ((*ebegin)[*bi]==(*eprev)[*bi]); bi++);
if (bi!=be)
insertPoint=NULL;
}
if (!insertPoint) {
edv->values.push_back(T_ExampleDist(PExample(mlnew TExample(boundDomain, *ebegin)), TDistribution::create(gen->domain->classVar)));
insertPoint = const_cast<TDistribution *>(edv->values.back().distribution.getUnwrappedPtr());
}
insertPoint->add((*ebegin).getClass(), WEIGHT(*ebegin));
}
return edv;
}
TClustersFromDistributionsByAssessor::TClustersFromDistributionsByAssessor(float mpp, PDistributionAssessor acola)
: distributionAssessor(acola),
minProfitProportion(mpp)
{}
void TClustersFromDistributionsByAssessor::computeQualities(TDistClusterNode *&clusters, TDistProfitQueue &profitQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
// Computes errors and merge profits
{ profitQueue = TDistProfitQueue();
baseQuality = 0.0;
for(TDistClusterNode *cl1 = clusters; cl1; cl1=cl1->nextNode) {
cl1->distributionQuality_N = distributionAssessor->distributionQuality(*cl1);
baseQuality += cl1->distributionQuality_N;
for(TDistClusterNode *cl2 = clusters; cl2!=cl1; cl2=cl2->nextNode) {
float profit = distributionAssessor->mergeProfit(*cl1, *cl2);
insertProfitQueueNode(cl2, cl1, profit, rgen.randsemilong(), profitQueue);
}
}
}
TDistributionAssessor_Kramer defaultDistributionAssessor;
PExampleClusters TClustersFromDistributionsByAssessor::operator()(PExampleDistVector edv)
{
bool defaultAssessorUsed = !distributionAssessor;
if (defaultAssessorUsed)
distributionAssessor = PDistributionAssessor(defaultDistributionAssessor);
vector<PExampleCluster> group;
float baseQuality;
float N;
TDistClusterNode *clusters = NULL;
int nex = 0;
ITERATE(vector<T_ExampleDist>, edvi, edv->values)
nex += (*edvi).distribution->cases;
TSimpleRandomGenerator rgen;
try {
TDistProfitQueue profitQueue;
preparePrivateVars(edv, clusters, profitQueue, baseQuality, N, rgen);
while(profitQueue.size() && (!stopCriterion || !stopCriterion->operator()(baseQuality, profitQueue, clusters)))
mergeBestColumns(clusters, profitQueue, baseQuality, N, rgen);
for(TDistClusterNode *cli = clusters; cli; cli = cli->nextNode)
group.push_back(cli->cluster);
}
catch (...) {
if (defaultAssessorUsed)
distributionAssessor = PDistributionAssessor();
mldelete clusters;
clusters = NULL;
throw;
}
mldelete clusters;
clusters = NULL;
if (defaultAssessorUsed)
distributionAssessor = PDistributionAssessor();
return mlnew TExampleClusters(PExampleCluster(mlnew TExampleCluster(group, numeric_limits<float>::infinity())), baseQuality);
}
void TClustersFromDistributionsByAssessor::preparePrivateVars(PExampleDistVector values, TDistClusterNode *&clusters, TDistProfitQueue &priorityQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
{
vector<T_ExampleDist>::iterator cli(values->values.begin()), cle(values->values.end());
if (cli==cle)
raiseError("empty 'ExampleDistVector'; no examples?!");
TDistClusterNode *prevIns=clusters=mlnew TDistClusterNode((*cli).distribution, (*cli).example, 0.0, NULL);
/* There was a bug here: the following line used to be
PDistribution classDist=(*cli).distribution;
*/
PDistribution classDist = CLONE(TDistribution, (*cli).distribution);
while (++cli!=cle) {
prevIns->nextNode=mlnew TDistClusterNode((*cli).distribution, (*cli).example, 0.0, prevIns);
prevIns=prevIns->nextNode;
classDist->operator += ((*cli).distribution.getReference());
}
N = classDist->abs;
if (classDist->variable->varType==TValue::INTVAR)
distributionAssessor->setDistribution(CAST_TO_DISCDISTRIBUTION(classDist));
else
distributionAssessor->setAverage(CAST_TO_CONTDISTRIBUTION(classDist).average());
computeQualities(clusters, priorityQueue, baseQuality, N, rgen);
baseQuality /= N;
}
TDistProfitNode *TClustersFromDistributionsByAssessor::insertProfitQueueNode(TDistClusterNode *cl1, TDistClusterNode *cl2, float profit, long roff, TDistProfitQueue &profitQueue)
{ TDistProfitNode *newNode = mlnew TDistProfitNode(cl1, cl2, profit, profitQueue.size(), roff);
profitQueue.insert(newNode);
newNode->it1 = mlnew TDistProfitNodeList(newNode, &cl1->mergeProfits);
newNode->it2 = mlnew TDistProfitNodeList(newNode, &cl2->mergeProfits);
return newNode;
}
void TClustersFromDistributionsByAssessor::mergeBestColumns(TDistClusterNode *&clusters, TDistProfitQueue &profitQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
{
TDistClusterNode *cl1 = profitQueue.front()->cluster1, *cl2 = profitQueue.front()->cluster2;
const float &profitN = profitQueue.front()->profit;
// merge the columns and update qualities
cl1->cluster = mlnew TExampleCluster(cl1->cluster, cl2->cluster, -profitN/N);
cl1->distribution ->operator+= (cl2->distribution);
cl1->distributionQuality_N += cl2->distributionQuality_N - profitN;
baseQuality += profitN/N;
// delete cl2 from list of clusters
if (cl2->nextNode)
cl2->nextNode->prevNode = cl2->prevNode;
if (cl2->prevNode)
cl2->prevNode->nextNode = cl2->nextNode;
else
clusters = cl2->nextNode;
cl2->prevNode = cl2->nextNode = NULL;
// remove profits from the queue; the joint is removed in cl1.
{ TDistProfitNodeList &first = cl1->mergeProfits;
while(first.next)
profitQueue.remove(first.next->node->queueIndex);
}
{ TDistProfitNodeList &first = cl2->mergeProfits;
while(first.next)
profitQueue.remove(first.next->node->queueIndex);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -