dist_clustering.cpp

来自「orange源码数据挖掘技术」· C++ 代码 · 共 570 行 · 第 1/2 页
CPP
570 行
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include "stladdon.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "learn.hpp"
#include "classify.hpp"
#include "table.hpp"
#include "measures.hpp"
#include "distance.hpp"

#include "dist_clustering.ppp"

TDistClusterNode::TDistClusterNode(PDistribution adis, const PExample &example, const float &quality, TDistClusterNode *aprevNode)
: nextNode(NULL),
  prevNode(aprevNode),
  distribution(adis),
  mergeProfits(NULL),
  cluster(mlnew TExampleCluster(example)),
  distributionQuality_N(quality)
{}


TDistClusterNode::~TDistClusterNode()
{ mldelete nextNode; }



TDistProfitNode::TDistProfitNode(TDistClusterNode *c1, TDistClusterNode *c2, const float &prof, const int &qind, const long &roff)
: cluster1(c1),
  cluster2(c2),
  profit(prof),
  queueIndex(qind),
  randoff(roff)
{}


TDistProfitNode::~TDistProfitNode()
{ mldelete it1;
  mldelete it2;
}


int TDistProfitNode::compare(const TDistProfitNode &other) const
{ if (profit<other.profit)
    return -1;
  else if (profit>other.profit)
    return 1;
  else if (randoff<other.randoff)
    return -1;
  else if (randoff>other.randoff)
    return 1;
  return 0;
}



T_ExampleDist::T_ExampleDist(PExample anexample, PDistribution adist)
: example(anexample),
  distribution(adist)
{}


int TExampleDistVector::traverse(visitproc visit, void *arg) const
{ TRAVERSE(TOrange::traverse);
  const_ITERATE(vector<T_ExampleDist>, pi, values) {
    PVISIT((*pi).example);
    PVISIT((*pi).distribution);
  }
  return 0;
}


int TExampleDistVector::dropReferences()
{ DROPREFERENCES(TOrange::dropReferences);
  values.clear();
  return 0;
}


PExampleDistVector TExampleDistBySorting::operator()(PExampleGenerator gen, TVarList &aboundSet, const int &weightID)
{
  // Identify bound attributes
  vector<int> bound;
  { ITERATE(TVarList, evi, aboundSet) {
      if ((*evi)->varType != TValue::INTVAR)
        raiseError("attribute '%s' is not discrete", (*evi)->name.c_str());
      bound.push_back(gen->domain->getVarNum(*evi));
    }
  }

  PDomain boundDomain = mlnew TDomain(PVariable(), aboundSet);

  TExampleTable sorted(gen, false);
  sorted.sort(bound);

  PExampleDistVector edv = mlnew TExampleDistVector();

  TDistribution *insertPoint = NULL;
  for(TExampleIterator ebegin(sorted.begin()), eend(sorted.end()), eprev(ebegin); ebegin!=eend; eprev=ebegin, ++ebegin) {
    if (insertPoint) { // we have it, but should still check for equality of bound attributes
      vector<int>::iterator bi(bound.begin()), be(bound.end());
      for( ; (bi!=be) && ((*ebegin)[*bi]==(*eprev)[*bi]); bi++);
      if (bi!=be)
        insertPoint=NULL;
    }

    if (!insertPoint) {
      edv->values.push_back(T_ExampleDist(PExample(mlnew TExample(boundDomain, *ebegin)), TDistribution::create(gen->domain->classVar)));
      insertPoint = const_cast<TDistribution *>(edv->values.back().distribution.getUnwrappedPtr());
    }
    insertPoint->add((*ebegin).getClass(), WEIGHT(*ebegin));
  }

  return edv;
}




TClustersFromDistributionsByAssessor::TClustersFromDistributionsByAssessor(float mpp, PDistributionAssessor acola)
: distributionAssessor(acola),
  minProfitProportion(mpp)
{}



void TClustersFromDistributionsByAssessor::computeQualities(TDistClusterNode *&clusters, TDistProfitQueue &profitQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
// Computes errors and merge profits
{ profitQueue = TDistProfitQueue();
  baseQuality = 0.0;
  for(TDistClusterNode *cl1 = clusters; cl1; cl1=cl1->nextNode) {
    cl1->distributionQuality_N = distributionAssessor->distributionQuality(*cl1);
	  baseQuality += cl1->distributionQuality_N;
	  for(TDistClusterNode *cl2 = clusters; cl2!=cl1; cl2=cl2->nextNode) {
  	  float profit = distributionAssessor->mergeProfit(*cl1, *cl2);
      insertProfitQueueNode(cl2, cl1, profit, rgen.randsemilong(), profitQueue);
	  }
  }
}


TDistributionAssessor_Kramer defaultDistributionAssessor;

PExampleClusters TClustersFromDistributionsByAssessor::operator()(PExampleDistVector edv)
{
  bool defaultAssessorUsed = !distributionAssessor;
  if (defaultAssessorUsed)
    distributionAssessor = PDistributionAssessor(defaultDistributionAssessor);

  vector<PExampleCluster> group;

  float baseQuality;
  float N;
  TDistClusterNode *clusters = NULL;

  int nex = 0;
  ITERATE(vector<T_ExampleDist>, edvi, edv->values)
    nex += (*edvi).distribution->cases;

  TSimpleRandomGenerator rgen;

  try {
    TDistProfitQueue profitQueue;
    preparePrivateVars(edv, clusters, profitQueue, baseQuality, N, rgen);

    while(profitQueue.size() && (!stopCriterion || !stopCriterion->operator()(baseQuality, profitQueue, clusters)))
      mergeBestColumns(clusters, profitQueue, baseQuality, N, rgen);

    for(TDistClusterNode *cli = clusters; cli; cli = cli->nextNode)
      group.push_back(cli->cluster);
  }
  catch (...) {
    if (defaultAssessorUsed)
      distributionAssessor = PDistributionAssessor();
    mldelete clusters;
    clusters = NULL;
    throw;
  }

  mldelete clusters;
  clusters = NULL;
  if (defaultAssessorUsed)
    distributionAssessor = PDistributionAssessor();

  return mlnew TExampleClusters(PExampleCluster(mlnew TExampleCluster(group, numeric_limits<float>::infinity())), baseQuality);
}
  


void TClustersFromDistributionsByAssessor::preparePrivateVars(PExampleDistVector values, TDistClusterNode *&clusters, TDistProfitQueue &priorityQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
{

  vector<T_ExampleDist>::iterator cli(values->values.begin()), cle(values->values.end());
  if (cli==cle)
    raiseError("empty 'ExampleDistVector'; no examples?!");

  TDistClusterNode *prevIns=clusters=mlnew TDistClusterNode((*cli).distribution, (*cli).example, 0.0, NULL);
  /* There was a bug here: the following line used to be
       PDistribution classDist=(*cli).distribution;
  */
  PDistribution classDist = CLONE(TDistribution, (*cli).distribution);
  while (++cli!=cle) {
    prevIns->nextNode=mlnew TDistClusterNode((*cli).distribution, (*cli).example, 0.0, prevIns);
    prevIns=prevIns->nextNode;
    classDist->operator += ((*cli).distribution.getReference());
  }
    
  N = classDist->abs;
  if (classDist->variable->varType==TValue::INTVAR)
    distributionAssessor->setDistribution(CAST_TO_DISCDISTRIBUTION(classDist));
  else
    distributionAssessor->setAverage(CAST_TO_CONTDISTRIBUTION(classDist).average());

  computeQualities(clusters, priorityQueue, baseQuality, N, rgen);
  baseQuality /= N;
}




TDistProfitNode *TClustersFromDistributionsByAssessor::insertProfitQueueNode(TDistClusterNode *cl1, TDistClusterNode *cl2, float profit, long roff, TDistProfitQueue &profitQueue)
{ TDistProfitNode *newNode = mlnew TDistProfitNode(cl1, cl2, profit, profitQueue.size(), roff);
  profitQueue.insert(newNode);
  newNode->it1 = mlnew TDistProfitNodeList(newNode, &cl1->mergeProfits);
  newNode->it2 = mlnew TDistProfitNodeList(newNode, &cl2->mergeProfits);
  return newNode;
}




void TClustersFromDistributionsByAssessor::mergeBestColumns(TDistClusterNode *&clusters, TDistProfitQueue &profitQueue, float &baseQuality, float &N, TSimpleRandomGenerator &rgen)
{
  TDistClusterNode *cl1 = profitQueue.front()->cluster1, *cl2 = profitQueue.front()->cluster2;
  const float &profitN = profitQueue.front()->profit;

  // merge the columns and update qualities
  cl1->cluster = mlnew TExampleCluster(cl1->cluster, cl2->cluster, -profitN/N);
  cl1->distribution ->operator+= (cl2->distribution);
  cl1->distributionQuality_N += cl2->distributionQuality_N - profitN;
  baseQuality += profitN/N;

  // delete cl2 from list of clusters
  if (cl2->nextNode)
    cl2->nextNode->prevNode = cl2->prevNode;

  if (cl2->prevNode)
    cl2->prevNode->nextNode = cl2->nextNode;
  else
    clusters = cl2->nextNode;

  cl2->prevNode = cl2->nextNode = NULL;

  // remove profits from the queue; the joint is removed in cl1.
  { TDistProfitNodeList &first = cl1->mergeProfits;
    while(first.next)
      profitQueue.remove(first.next->node->queueIndex);
  }
  { TDistProfitNodeList &first = cl2->mergeProfits;
    while(first.next)
      profitQueue.remove(first.next->node->queueIndex);
dist_clustering.cpp - 源码说明

本页面展示了「orange源码数据挖掘技术」中的 dist_clustering.cpp 源码文件，采用 C++ 编程语言编写，共 570 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与orange相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?