📄 ltkhierarchicalclustering.h
字号:
/*****************************************************************************************
* Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
* OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*****************************************************************************************/
/************************************************************************
* FILE DESCR: Definitions of Agglomerative Hierarchical Clustering module
*
* CONTENTS:
* cluster
* getProximityMatrix
* setOutputConfig
* setHyperlinkMap
* getClusterResult
* computeProximityMatrix
* computeDistances
* clusterToFindNumClusters
* getInterObjectDistance
* findGroup
* findInterClusterDistance
* writeClustersAsHTML
* determineNumOfClusters
* determineKnee
* findRMSE
* computeAvgSil
*
*
* AUTHOR: Bharath A
*
* DATE: February 22, 2005
* CHANGE HISTORY:
* Author Date Description of change
************************************************************************/
#ifndef __LTKHIERARCHICALCLUSTERING_H
#define __LTKHIERARCHICALCLUSTERING_H
#ifndef _WIN32
#include <values.h>
#endif
#include "LTKInc.h"
#include "LTKTypes.h"
#include "LTKLoggerUtil.h"
#include "LTKException.h"
#include "LTKErrors.h"
/*Enumerator for stopping criterion to be used*/
enum ELTKHCStoppingCriterion
{
LMETHOD,
AVG_SIL
};
/*Enumerator for methods in hierarchical clustering*/
enum ELTKHCMethod
{
SINGLE_LINKAGE,
COMPLETE_LINKAGE,
AVERAGE_LINKAGE
};
#define OUTPUT_HTML_FILE_NAME "output.html"
#define MIN_CUTOFF 20
/**
* @class LTKHierarchicalClustering
* <p> This class does agglomerative hierarchical clustering. The data objects
which could be LTKTrace or LTKTraceGroup, are supplied as a vector.
Function that defines the distance between two data objects needs to be
supplied as a function pointer.One of the 3 methods (Single,Average or
Complete linkage) needs to be selected to define the way inter-cluster
has to be determined. In case number of clusters is not supplied,
it is determined using the L-method (default stopping criterion)<p> */
template <class ClusterObjType,class DistanceClass>
class LTKHierarchicalClustering
{
private:
//reference to the vector containing the data objects to be clustered
const vector<ClusterObjType>& m_data;
//triangular matrix containing the pairwise distances between data
//objects
float2DVector m_proximityMatrix;
//data structure that stores current (intermediate) state of the
//clusters
int2DVector m_intermediateCG;
//vector mapping the data object id and path to the data (unipen) file
stringVector m_hyperlinksVec;
//contains the number of clusters required
int m_numOfClusters;
//output file handle to write the cluster results as html
//with name of the file as OUTPUT_HTML_FILE_NAME
ofstream m_output;
//flag to indicate whether the output is required as html
bool m_writeHTML;
//flag to indicate whether to show all levels of the hierarchy in the
//html file
bool m_showAllLevels;
//vector to hold merging distance for each number of clusters
floatVector m_mergingDist;
//flag for determining number of clusters
bool m_determineClusters;
//output result directory
string m_outputDir;
//extension of the image corresponding to each data object in
//order to write in the html
string m_imageFileExtn;
//Method for defining inter-cluster distance
ELTKHCMethod m_method;
//number of clusters determined by Average Silhouette method
int m_numBySil;
//cached clustering result corresponding minimum average silhouette
int2DVector m_cachedResult;
//stopping criterion selected - LMethod or Average Silhouette
ELTKHCStoppingCriterion m_stoppingCriterion;
//pointer to the class that has the definition of distance function
DistanceClass* m_distClassPtr;
//function pointer type of the function that defines inter-object distance
typedef int (DistanceClass::*FN_PTR_DISTANCE)(const ClusterObjType&,
const ClusterObjType&,
float&) ;
//distance function pointer
FN_PTR_DISTANCE m_distancePtr;
public:
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : LTKHierarchicalClustering
* DESCRIPTION : Constructor to initialise the required parameters
* ARGUMENTS : clusterObjects - vector of data objects which could be LKTrace or
* LTKTraceGroup for HWR
* noOfClusters - Number of clusters required
* clusteringMethod - One of the 3 methods:
* SINGLE_LINKAGE,AVERAGE_LINKAGE
* or COMPLETE_LINKAGE
*
* RETURNS :
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,int noOfClusters,
ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE) :
m_data(clusterObjects),m_method(clusteringMethod),
m_numOfClusters(noOfClusters),m_writeHTML(false),
m_showAllLevels(false),
m_determineClusters(false)
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
if(m_numOfClusters < 1 || m_numOfClusters>=clusterObjects.size())
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Number of clusters:"<<m_numOfClusters<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error : "<< EINVALID_NUM_CLUSTERS <<":"<< getErrorMessage(EINVALID_NUM_CLUSTERS)
<<" LTKHierarchicalClustering::"
<<"LTKHierarchicalClustering(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
throw LTKException(EINVALID_NUM_CLUSTERS);
}
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
}
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : LTKHierarchicalClustering
* DESCRIPTION : Constructor to initialise the required parameters.
* Number of clusters is determined by L-method
* ARGUMENTS : clusterObjects - vector of data objects which could be LKTrace or
* LTKTraceGroup for HWR
* clusteringMethod - One of the 3 methods:
* SINGLE_LINKAGE,AVERAGE_LINKAGE
* or COMPLETE_LINKAGE
* stoppingCriterion - stopping criterion to determine the
* right set of clusters: LMETHOD or AVG_SIL
*
* RETURNS :
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,
ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE,
ELTKHCStoppingCriterion stoppingCriterion=LMETHOD) :
m_data(clusterObjects),
m_method(clusteringMethod),
m_stoppingCriterion(stoppingCriterion),
m_writeHTML(false),
m_showAllLevels(false),
m_determineClusters(true)
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
if(clusterObjects.size()==0)
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Number of elements in clusterObjects vector is zero"<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error : "<< ENO_DATA_TO_CLUSTER <<":"<< getErrorMessage(ENO_DATA_TO_CLUSTER)
<<" LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
throw LTKException(ENO_DATA_TO_CLUSTER);
}
if(clusterObjects.size() < 6 && stoppingCriterion == LMETHOD)
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Number of elements in clusterObjects vector is:"
<<clusterObjects.size()<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error : "<< EINSUFFICIENT_DATA_FOR_LMETHOD
<<":"<< getErrorMessage(EINSUFFICIENT_DATA_FOR_LMETHOD)
<<" LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
throw LTKException(EINSUFFICIENT_DATA_FOR_LMETHOD);
}
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
}
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : cluster
* DESCRIPTION : Clusters the input data objects. The number of clusters is determined
* based on the stopping criterion or as specified by the user.
* ARGUMENTS : distanceClassPtr - pointer to the class that contains the distance
* function defintion
* distFuncPtr - distance function pointer
* RETURNS : error code
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
int cluster(DistanceClass* distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::cluster()"<<endl;
m_distancePtr=distFuncPtr;
m_distClassPtr=distanceClassPtr;
//To compute inter-object distances
int errorCode = computeDistances();
if (errorCode != SUCCESS )
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error: LTKHierarchicalClustering::cluster()"<<endl;
LTKReturnError(errorCode)
}
//if the user has specified the number of clusters
if(!m_determineClusters)
{
clusterToFindNumClusters();
}
else
{
m_numOfClusters=1;
//clustering to determine the number of
//clusters vs merging distance curve
clusterToFindNumClusters();
m_determineClusters=false;
if(m_stoppingCriterion==LMETHOD)
{
//Number of clusters determined by L-method
m_numOfClusters=determineNumOfClusters();
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
<<"Number of clusters determined using L-Method"
<<m_numOfClusters<<endl;
}
else if(m_stoppingCriterion==AVG_SIL)
{
//Number of clusters determined by silhouette method
m_numOfClusters=m_numBySil;
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
<<"Number of clusters determined using Average Silhouette method"
<<m_numOfClusters<<endl;
}
//clearing intermediate clusters formed during evaluation
//of the stopping criterion
m_intermediateCG.clear();
//clustering to the number of clusters determined
//by the stopping criterion
clusterToFindNumClusters();
}
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::cluster()"<<endl;
return SUCCESS;
}
/**********************************************************************************
* AUTHOR : Dinesh M
* DATE : 23-Jan-2006
* NAME : getProximityMatrix
* DESCRIPTION : returns the distance matrix
* ARGUMENTS :
* RETURNS : proximity matrix (float2DVector)
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
const float2DVector& getProximityMatrix() const
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
return m_proximityMatrix;
}
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : setOutputConfig
* DESCRIPTION : This function sets the configuration for the output of clustering.
* ARGUMENTS : outputDirectory - path to the directory where output html
* is to be generated
* displayAllLevels - flag to indicate whether all levels in the
* clustering need to written to the file
* imageFileExtension - extension of the image file (ex)"png".
* If not specified <img> tag in the output html is not created.
* RETURNS : error code
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
//int setOutputConfig(const string& outputDirectory,
// bool displayAllLevels=false,
// string imageFileExtension="")
// TODO: check if the output directory is valid
int setOutputConfig(const string& outputDirectory,bool displayAllLevels=false,
const string& imageFileExtension="")
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
m_writeHTML=true;
m_showAllLevels=displayAllLevels;
string tempOutputFilePath=outputDirectory+"/"+OUTPUT_HTML_FILE_NAME;
m_output.open(tempOutputFilePath.c_str());
if(m_output.fail())
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Unable to create file:"
<<tempOutputFilePath<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error : "<< EFILE_CREATION_FAILED <<":"
<<getErrorMessage(EFILE_CREATION_FAILED)
<<" LTKHierarchicalClustering::setOutputConfig()" <<endl;
LTKReturnError(EFILE_CREATION_FAILED)
}
m_outputDir=outputDirectory;
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
<<"Clustering results output directory:"
<<m_outputDir<<endl;
m_output.close();
//If it takes the default value, <img> tag in the output is not generated
m_imageFileExtn=imageFileExtension;
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
<<"Image file extension:"<<m_imageFileExtn<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
return SUCCESS;
}
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : setHyperlinkMap
* DESCRIPTION : To set hyperlinks for each data object which refers to actual data file.
* Assumes one-to-one correspondence with the data vector
* passed in the constructor.
* ARGUMENTS : hyperlinksVector - Vector containing paths to physical
* files of each data object.
*
* RETURNS : error code
* NOTES :
* CHANGE HISTROY
* Author Date Description of change
*************************************************************************************/
//int setHyperlinkMap(const vector<string>& hyperlinksVector)
int setHyperlinkMap(const vector<string>& hyperlinksVector)
{
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
<<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
if(m_data.size()!=hyperlinksVector.size())
{
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"cluster objects vector size:"<<m_data.size()
<<" and hyperlinks vector size:"<<hyperlinksVector.size()<<endl;
LOG(LTKLogger::LTK_LOGLEVEL_ERR)
<<"Error : "<< EDATA_HYPERLINK_VEC_SIZE_MISMATCH
<<":"<< getErrorMessage(EDATA_HYPERLINK_VEC_SIZE_MISMATCH)
<<" LTKHierarchicalClustering::setHyperlinkMap()" <<endl;
LTKReturnError(EDATA_HYPERLINK_VEC_SIZE_MISMATCH);
}
m_hyperlinksVec=hyperlinksVector; //Vector for hyperlinks is set
LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
<<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
return SUCCESS;
}
/**********************************************************************************
* AUTHOR : Bharath A
* DATE : 22-FEB-2005
* NAME : getClusterResult
* DESCRIPTION : Populates the argument (vector of vectors) with data objects indices.
* Each row (inner vector) corresponds to a cluster.
* ARGUMENTS : outClusterResult - reference to result vector of vectors.
*
* RETURNS :
* NOTES :
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -