⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ltkhierarchicalclustering.h

📁 An open source handwriting recongnition package!!!
💻 H
📖 第 1 页 / 共 3 页
字号:
/*****************************************************************************************
* Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
* OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*****************************************************************************************/

/************************************************************************
 * FILE DESCR: Definitions of Agglomerative Hierarchical Clustering module
 *
 * CONTENTS:
 *			cluster
 *			getProximityMatrix
 *			setOutputConfig
 *			setHyperlinkMap
 *			getClusterResult
 *			computeProximityMatrix
 *			computeDistances
 *			clusterToFindNumClusters
 *			getInterObjectDistance
 *			findGroup
 *			findInterClusterDistance
 *			writeClustersAsHTML
 *			determineNumOfClusters
 *			determineKnee
 *			findRMSE
 *			computeAvgSil
 *
 *
 * AUTHOR:     Bharath A
 *
 * DATE:       February 22, 2005
 * CHANGE HISTORY:
 * Author		Date			Description of change
 ************************************************************************/
#ifndef __LTKHIERARCHICALCLUSTERING_H
#define __LTKHIERARCHICALCLUSTERING_H


#ifndef _WIN32
#include <values.h>
#endif

#include "LTKInc.h"
#include "LTKTypes.h"
#include "LTKLoggerUtil.h"
#include "LTKException.h"
#include "LTKErrors.h"

/*Enumerator for stopping criterion to be used*/
enum ELTKHCStoppingCriterion
{
	LMETHOD,
	AVG_SIL
};

/*Enumerator for methods in hierarchical clustering*/
 enum ELTKHCMethod
 {
	SINGLE_LINKAGE,
	COMPLETE_LINKAGE,
	AVERAGE_LINKAGE
 };

#define OUTPUT_HTML_FILE_NAME "output.html"
#define MIN_CUTOFF 20

/**
 * @class LTKHierarchicalClustering
 * <p> This class does agglomerative hierarchical clustering. The data objects
	   which could be LTKTrace or LTKTraceGroup, are supplied as a vector.
	   Function that defines the distance between two data objects needs to be
       supplied as a function pointer.One of the 3 methods (Single,Average or
       Complete linkage) needs to be selected to define the way inter-cluster
       has to be determined. In case number of clusters is not supplied,
       it is determined using the L-method (default stopping criterion)<p> */

template <class ClusterObjType,class DistanceClass>
class LTKHierarchicalClustering
{

	private:

		//reference to the vector containing the data objects to be clustered
	    const vector<ClusterObjType>& m_data;

		//triangular matrix containing the pairwise distances between data
		//objects
  		float2DVector m_proximityMatrix;

		//data structure that stores current (intermediate) state of the
		//clusters
  		int2DVector m_intermediateCG;

		//vector mapping the data object id and path to the data (unipen) file
		stringVector m_hyperlinksVec;

		//contains the number of clusters required
  		int m_numOfClusters;

		//output file handle to write the cluster results as html
		//with name of the file as OUTPUT_HTML_FILE_NAME
		ofstream m_output;

		//flag to indicate whether the output is required as html
		bool m_writeHTML;

		//flag to indicate whether to show all levels of the hierarchy in the
		//html file
		bool m_showAllLevels;

		//vector to hold merging distance for each number of clusters
		floatVector m_mergingDist;

		//flag for determining number of clusters
		bool m_determineClusters;

		//output result directory
		string m_outputDir;

		//extension of the image corresponding to each data object in
		//order to write in the html
		string m_imageFileExtn;

		//Method for defining inter-cluster distance
		ELTKHCMethod m_method;

		 //number of clusters determined by Average Silhouette method
		int m_numBySil;

		//cached clustering result corresponding minimum average silhouette
		int2DVector m_cachedResult;

		//stopping criterion selected - LMethod or Average Silhouette
		ELTKHCStoppingCriterion m_stoppingCriterion;

		//pointer to the class that has the definition of distance function
		DistanceClass* m_distClassPtr;

		//function pointer type of the function that defines inter-object distance
		typedef int (DistanceClass::*FN_PTR_DISTANCE)(const ClusterObjType&,
                                                      const ClusterObjType&,
                                                      float&) ;

		//distance function pointer
		FN_PTR_DISTANCE m_distancePtr;


	public:

/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: LTKHierarchicalClustering
* DESCRIPTION	: Constructor to initialise the required parameters
* ARGUMENTS		: clusterObjects - vector of data objects which could be LKTrace or
*									LTKTraceGroup for HWR
*				  noOfClusters - Number of clusters required
*				  clusteringMethod - One of the 3 methods:
*									 SINGLE_LINKAGE,AVERAGE_LINKAGE
*									 or COMPLETE_LINKAGE
*
* RETURNS		:
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/


LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,int noOfClusters,
						  ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE) :
						  m_data(clusterObjects),m_method(clusteringMethod),
						  m_numOfClusters(noOfClusters),m_writeHTML(false),
						  m_showAllLevels(false),
						  m_determineClusters(false)
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
		<<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;

	if(m_numOfClusters < 1 || m_numOfClusters>=clusterObjects.size())
	{
		 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			 <<"Number of clusters:"<<m_numOfClusters<<endl;

		 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Error : "<< EINVALID_NUM_CLUSTERS <<":"<< getErrorMessage(EINVALID_NUM_CLUSTERS)
            <<" LTKHierarchicalClustering::"
			<<"LTKHierarchicalClustering(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;

		throw LTKException(EINVALID_NUM_CLUSTERS);
	}

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
		<<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;

}


/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: LTKHierarchicalClustering
* DESCRIPTION	: Constructor to initialise the required parameters.
*                 Number of clusters is determined by L-method
* ARGUMENTS		: clusterObjects - vector of data objects which could be LKTrace or
*									LTKTraceGroup for HWR
*				  clusteringMethod - One of the 3 methods:
*									 SINGLE_LINKAGE,AVERAGE_LINKAGE
*									 or COMPLETE_LINKAGE
*				  stoppingCriterion - stopping criterion to determine the
*									  right set of clusters: LMETHOD or AVG_SIL
*
* RETURNS		:
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/


LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,
						  ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE,
						  ELTKHCStoppingCriterion stoppingCriterion=LMETHOD) :
						  m_data(clusterObjects),
						  m_method(clusteringMethod),
						  m_stoppingCriterion(stoppingCriterion),
						  m_writeHTML(false),
						  m_showAllLevels(false),
						  m_determineClusters(true)
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
		<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;

	if(clusterObjects.size()==0)
	{
		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Number of elements in clusterObjects vector is zero"<<endl;

		 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Error : "<< ENO_DATA_TO_CLUSTER <<":"<< getErrorMessage(ENO_DATA_TO_CLUSTER)
            <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
			<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;


		throw LTKException(ENO_DATA_TO_CLUSTER);
	}

	if(clusterObjects.size() < 6 && stoppingCriterion == LMETHOD)
	{
		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Number of elements in clusterObjects vector is:"
			<<clusterObjects.size()<<endl;

		 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Error : "<< EINSUFFICIENT_DATA_FOR_LMETHOD
			<<":"<< getErrorMessage(EINSUFFICIENT_DATA_FOR_LMETHOD)
            <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
			<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;

		throw LTKException(EINSUFFICIENT_DATA_FOR_LMETHOD);
	}

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::LTKHierarchicalClustering"
		<<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;

}



/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: cluster
* DESCRIPTION	: Clusters the input data objects. The number of clusters is determined
*				  based on the stopping criterion or as specified by the user.
* ARGUMENTS		: distanceClassPtr - pointer to the class that contains the distance
*									 function defintion
*				  distFuncPtr - distance function pointer
* RETURNS		: error code
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/
int cluster(DistanceClass* distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::cluster()"<<endl;


	m_distancePtr=distFuncPtr;

	m_distClassPtr=distanceClassPtr;


	//To compute inter-object distances
	int errorCode = computeDistances();

	if (errorCode != SUCCESS )
	{
		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
        <<"Error: LTKHierarchicalClustering::cluster()"<<endl;

		LTKReturnError(errorCode)
	}

	//if the user has specified the number of clusters
	if(!m_determineClusters)
	{
		 clusterToFindNumClusters();
	}
	else
	{
			m_numOfClusters=1;

			//clustering to determine the number of
			//clusters vs merging distance curve
			clusterToFindNumClusters();

			m_determineClusters=false;

			if(m_stoppingCriterion==LMETHOD)
			{
				//Number of clusters determined by L-method
			    m_numOfClusters=determineNumOfClusters();

				LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
					<<"Number of clusters determined using L-Method"
					<<m_numOfClusters<<endl;

			}
			else if(m_stoppingCriterion==AVG_SIL)
			{

				//Number of clusters determined by silhouette method
			    m_numOfClusters=m_numBySil;

				LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
					<<"Number of clusters determined using Average Silhouette method"
					<<m_numOfClusters<<endl;

			}

			//clearing intermediate clusters formed during evaluation
			//of the stopping criterion
			m_intermediateCG.clear();

			//clustering to the number of clusters determined
			//by the stopping criterion
			clusterToFindNumClusters();

	  }


	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::cluster()"<<endl;
	 return SUCCESS;

}



/**********************************************************************************
* AUTHOR		: Dinesh M
* DATE			: 23-Jan-2006
* NAME			: getProximityMatrix
* DESCRIPTION	: returns the distance matrix
* ARGUMENTS		:
* RETURNS		: proximity matrix (float2DVector)
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/

const float2DVector& getProximityMatrix() const
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;


	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;

	return m_proximityMatrix;
}


/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: setOutputConfig
* DESCRIPTION	: This function sets the configuration for the output of clustering.
* ARGUMENTS		: outputDirectory - path to the directory where output html
*									is to be generated
*				  displayAllLevels - flag to indicate whether all levels in the
*							 		 clustering need to written to the file
*				  imageFileExtension - extension of the image file (ex)"png".
*							          If not specified <img> tag in the output html is not created.
* RETURNS		: error code
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/
//int setOutputConfig(const string& outputDirectory,
//                              bool displayAllLevels=false,
//                              string imageFileExtension="")
// TODO: check if the output directory is valid
int setOutputConfig(const string& outputDirectory,bool displayAllLevels=false,
					const string& imageFileExtension="")
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::setOutputConfig()"<<endl;

	m_writeHTML=true;

	m_showAllLevels=displayAllLevels;

	string tempOutputFilePath=outputDirectory+"/"+OUTPUT_HTML_FILE_NAME;

	m_output.open(tempOutputFilePath.c_str());

	if(m_output.fail())
	{

		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Unable to create file:"
			<<tempOutputFilePath<<endl;

		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Error : "<< EFILE_CREATION_FAILED <<":"
			<<getErrorMessage(EFILE_CREATION_FAILED)
            <<" LTKHierarchicalClustering::setOutputConfig()" <<endl;

		LTKReturnError(EFILE_CREATION_FAILED)
	}

	m_outputDir=outputDirectory;

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
		<<"Clustering results output directory:"
		<<m_outputDir<<endl;

	m_output.close();

	//If it takes the default value, <img> tag in the output is not generated
	m_imageFileExtn=imageFileExtension;

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
		<<"Image file extension:"<<m_imageFileExtn<<endl;

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::setOutputConfig()"<<endl;

	return SUCCESS;
}


/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: setHyperlinkMap
* DESCRIPTION	: To set hyperlinks for each data object which refers to actual data file.
*				  Assumes one-to-one correspondence with the data vector
*				  passed in the constructor.
* ARGUMENTS		: hyperlinksVector - Vector containing paths to physical
*									 files of each data object.
*
* RETURNS		: error code
* NOTES			:
* CHANGE HISTROY
* Author			Date				Description of change
*************************************************************************************/
//int setHyperlinkMap(const vector<string>& hyperlinksVector)
int setHyperlinkMap(const vector<string>& hyperlinksVector)
{

	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
		<<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;

	if(m_data.size()!=hyperlinksVector.size())
	{
		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"cluster objects vector size:"<<m_data.size()
			<<" and hyperlinks vector size:"<<hyperlinksVector.size()<<endl;

		LOG(LTKLogger::LTK_LOGLEVEL_ERR)
			<<"Error : "<< EDATA_HYPERLINK_VEC_SIZE_MISMATCH
			<<":"<< getErrorMessage(EDATA_HYPERLINK_VEC_SIZE_MISMATCH)
            <<" LTKHierarchicalClustering::setHyperlinkMap()" <<endl;

		LTKReturnError(EDATA_HYPERLINK_VEC_SIZE_MISMATCH);
	}

	m_hyperlinksVec=hyperlinksVector; //Vector for hyperlinks is set


	LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
		<<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;

	return SUCCESS;

}


/**********************************************************************************
* AUTHOR		: Bharath A
* DATE			: 22-FEB-2005
* NAME			: getClusterResult
* DESCRIPTION	: Populates the argument (vector of vectors) with data objects indices.
*				  Each row (inner vector) corresponds to a cluster.
* ARGUMENTS		: outClusterResult - reference to result vector of vectors.
*
* RETURNS		:
* NOTES			:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -