gcluster.cpp

来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 401 行

CPP
401
字号
/*	Copyright (C) 2006, Mike Gashler	This library is free software; you can redistribute it and/or	modify it under the terms of the GNU Lesser General Public	License as published by the Free Software Foundation; either	version 2.1 of the License, or (at your option) any later version.	see http://www.gnu.org/copyleft/lesser.html*/#include "GCluster.h"#include "GArray.h"#include "GArff.h"#include "GKNN.h"#include "GBitTable.h"#include "GPointerQueue.h"#include "GMath.h"#include "GVector.h"#include <math.h>#include <stdlib.h>GMergeClusterer::GMergeClusterer(GArffRelation* pRelation, GArffData* pData, int nClusters){	m_pRelation = pRelation;	m_pData = pData;	m_nClusters = nClusters;	m_pClusters = new int[pData->GetSize()];	int i;	for(i = 0; i < pData->GetSize(); i++)		m_pClusters[i] = i;	int nClusterCount = pData->GetSize();	int a, b, max, min;	while(nClusterCount > nClusters)	{		if(!FindBestMatch(&a, &b))		{			GAssert(false, "couldn't find any candidates to merge");			break;		}		min = MIN(a, b);		max = MAX(a, b);		ChangeCluster(max, min);		if(max != nClusterCount - 1)			ChangeCluster(nClusterCount - 1, max);		nClusterCount--;	}}GMergeClusterer::~GMergeClusterer(){	delete[] m_pClusters;}void GMergeClusterer::ChangeCluster(int nFrom, int nTo){	int i;	for(i = 0; i < m_pData->GetSize(); i++)	{		if(m_pClusters[i] == nFrom)			m_pClusters[i] = nTo;	}}bool GMergeClusterer::FindBestMatch(int* pA, int* pB){	double dBestDist = 1e200;	double d;	double* pVecA;	double* pVecB;	int i, j;	for(i = 0; i < m_pData->GetSize(); i++)	{		pVecA = m_pData->GetVector(i);		for(j = i + 1; j < m_pData->GetSize(); j++)		{			if(m_pClusters[i] == m_pClusters[j])				continue;			pVecB = m_pData->GetVector(j);			d = m_pRelation->ComputeInputDistanceSquared(pVecA, pVecB);			if(d < dBestDist)			{				dBestDist = d;				*pA = m_pClusters[i];				*pB = m_pClusters[j];			}		}	}	return (dBestDist < 1e200);}int GMergeClusterer::GetCluster(int nVector){	GAssert(nVector >= 0 && nVector < m_pData->GetSize(), "out of range");	return m_pClusters[nVector];}// -----------------------------------------------------------------------------------------GKMeans::GKMeans(GArffRelation* pRelation, GArffData* pData, int nClusters){	if(pData->GetSize() < nClusters)		throw "The number of clusters must be less than the number of data points";	m_pRelation = pRelation;	m_pData = pData;	m_nClusters = nClusters;	m_pClusters = new int[pData->GetSize()];	memset(m_pClusters, 0xff, sizeof(int) * pData->GetSize());	int i;	for(i = 0; i < 5; i++)	{		if(Cluster(pRelation->GetInputCount() * pData->GetSize()))			break;	}	GAssert(i < 5, "never fully converged");}GKMeans::~GKMeans(){	delete[] m_pClusters;}bool GKMeans::SelectSeeds(GArffData* pSeeds){	// Randomly select the seed points	int i, j, k, index;	double* pVector;	for(i = 0; i < m_nClusters; i++)	{		for(j = 0; j < m_nClusters; j++)		{			// Pick a point			index = rand() % m_pData->GetSize();			pVector = m_pData->GetVector(index);			// Make sure we didn't pick the same point already			bool bOK = true;			for(k = 0; k < i; k++)			{				if(m_pRelation->ComputeInputDistanceSquared(pVector, pSeeds->GetVector(k)) <= 0)				{					bOK = false;					break;				}			}			// Accept this seed point			if(bOK)			{				pSeeds->CopyVector(pVector, m_pRelation->GetAttributeCount());				break;			}		}		if(j >= m_nClusters)			return false; // failed to find "m_nClusters" unique seed points	}	return true;}bool GKMeans::Cluster(int nMaxIterations){	// Pick the seeds	GArffData means(m_nClusters);	if(!SelectSeeds(&means))		return false;	GAssert(means.GetSize() == m_nClusters, "Got the wrong number of seed points");	// Do the clustering	GTEMPBUF(int, pCounts, means.GetSize());	int i, j, k, nCluster;	double d, dBestDist;	double* pVector;	bool bChanged;	for(i = 0; i < nMaxIterations; i++)	{		// Assign new cluster to each point		bChanged = false;		for(j = 0; j < m_pData->GetSize(); j++)		{			pVector = m_pData->GetVector(j);			dBestDist = 1e200;			nCluster = -1;			for(k = 0; k < m_nClusters; k++)			{				d = m_pRelation->ComputeInputDistanceSquared(pVector, means.GetVector(k));				if(d < dBestDist)				{					dBestDist = d;					nCluster = k;				}			}			if(m_pClusters[j] != nCluster)				bChanged = true;			m_pClusters[j] = nCluster;		}		if(!bChanged)			break;		// Update the seeds		for(j = 0; j < means.GetSize(); j++)			GVector::SetToZero(means.GetVector(j), m_pRelation->GetAttributeCount());		memset(pCounts, '\0', sizeof(int) * means.GetSize());		for(j = 0; j < m_pData->GetSize(); j++)		{			GVector::Add(means.GetVector(m_pClusters[j]), m_pData->GetVector(j), m_pRelation->GetAttributeCount());			pCounts[m_pClusters[j]]++;		}		for(j = 0; j < means.GetSize(); j++)			GVector::Multiply(means.GetVector(j), 1.0 / pCounts[j], m_pRelation->GetAttributeCount());	}	return (i < nMaxIterations);}int GKMeans::GetCluster(int nVector){	GAssert(nVector >= 0 && nVector < m_pData->GetSize(), "out of range");	return m_pClusters[nVector];}// -----------------------------------------------------------------------------------------GNeighborClusterer::GNeighborClusterer(int nVectorCount){	m_pClusters = new GIntArray(8);	m_pVectors = new GIntArray(nVectorCount);}GNeighborClusterer::~GNeighborClusterer(){	delete(m_pClusters);	delete(m_pVectors);}// staticGNeighborClusterer* GNeighborClusterer::Cluster(GArffRelation* pRelation, GArffData* pData, int nNeighbors){	GNeighborFinder neighborFinder(pRelation, pData, MAX(8, nNeighbors));	int nVectors = pData->GetSize();	GNeighborClusterer* pClusterer = new GNeighborClusterer(nVectors);	GBitTable bits(nVectors);	GTEMPBUF(int, pNeighbors, nNeighbors);	GTEMPBUF(double, pDistances, nNeighbors);	int nFirstAvailable = 0;	int i, j;	double* pVector;	while(true)	{		while(nFirstAvailable < nVectors && bits.GetBit(nFirstAvailable))			nFirstAvailable++;		if(nFirstAvailable >= nVectors)			break;		pClusterer->m_pClusters->AddInt(pClusterer->m_pVectors->GetSize());		GIntQueue q;		q.Push(nFirstAvailable);		while(q.GetSize() > 0)		{			i = q.Pop();			if(bits.GetBit(i))				continue;			pClusterer->m_pVectors->AddInt(i);			bits.SetBit(i, true);			pVector = pData->GetVector(i);			neighborFinder.FindNeighbors(pNeighbors, pDistances, nNeighbors, pVector, i);			for(j = 0; j < nNeighbors; j++)				q.Push(pNeighbors[j]);		}	}	return pClusterer;}int GNeighborClusterer::GetClusterCount(){	return m_pClusters->GetSize();}int GNeighborClusterer::GetClusterSize(int nCluster){	if(nCluster < m_pClusters->GetSize() - 1)		return m_pClusters->GetInt(nCluster + 1) - m_pClusters->GetInt(nCluster);	else		return m_pVectors->GetSize() - m_pClusters->GetInt(nCluster);}int GNeighborClusterer::GetVectorIndex(int nCluster, int nVector){	return m_pVectors->GetInt(m_pClusters->GetInt(nCluster) + nVector);}#ifndef NO_TEST_CODE//staticvoid GNeighborClusterer::Test(){	// Make a 3D data set with 3 entwined spirals	GArffRelation rel;	rel.AddAttribute(new GArffAttribute(true, 0, NULL));	rel.AddAttribute(new GArffAttribute(true, 0, NULL));	rel.AddAttribute(new GArffAttribute(true, 0, NULL));	GArffData data(3000);	double dThirdCircle = PI * 2 / 3;	double* pVector;	int i;	for(i = 0; i < 1000; i++)	{		pVector = new double[3];		pVector[0] = cos((double)i * 4 * PI / 1000);		pVector[1] = sin((double)i * 4 * PI / 1000);		pVector[2] = (double)i / 1000;		data.AddVector(pVector);		pVector = new double[3];		pVector[0] = cos((double)i * 4 * PI / 1000 + dThirdCircle);		pVector[1] = sin((double)i * 4 * PI / 1000 + dThirdCircle);		pVector[2] = (double)i / 1000;		data.AddVector(pVector);		pVector = new double[3];		pVector[0] = cos((double)i * 4 * PI / 1000 + dThirdCircle + dThirdCircle);		pVector[1] = sin((double)i * 4 * PI / 1000 + dThirdCircle + dThirdCircle);		pVector[2] = (double)i / 1000;		data.AddVector(pVector);	}	// Cluster them	GNeighborClusterer* pClusterer = GNeighborClusterer::Cluster(&rel, &data, 10);	Holder<GNeighborClusterer*> hClusterer(pClusterer);	// Check the answers	if(pClusterer->GetClusterCount() != 3)		throw "wrong number of clusters";	int nClusterSize = pClusterer->GetClusterSize(0);	if(pClusterer->GetClusterSize(1) != nClusterSize)		throw "clusters different sizes";	if(pClusterer->GetClusterSize(2) != nClusterSize)		throw "clusters different sizes";	for(i = 0; i < nClusterSize; i++)	{		if(pClusterer->GetVectorIndex(0, i) % 3 != 0)			throw "vector in wrong cluster";		if(pClusterer->GetVectorIndex(1, i) % 3 != 1)			throw "vector in wrong cluster";		if(pClusterer->GetVectorIndex(2, i) % 3 != 2)			throw "vector in wrong cluster";	}}#endif // NO_TEST_CODE// -----------------------------------------------------------------------------------------/*void BlurVector(int nDims, double* pInput, double* pOutput, double dAmount){	double dWeight, dSumWeight;	int i, j;	for(i = 0; i < nDims; i++)	{		pOutput[i] = 0;		dSumWeight = 0;		for(j = 0; j < nDims; j++)		{			dWeight = GMath::gaussian((double)(j - i) / dAmount);			dSumWeight += dWeight;			pOutput[i] += dWeight * pInput[j];		}		pOutput[i] /= dSumWeight;	}}*//*void MakeGaussianHistogram(int nElements, double* pInput, double* pOutput, double dBlurAmount){	int i, j;	for(i = 0; i < nElements; i++)	{		pOutput[i] = 0;		for(j = 0; j < nElements; j++)			pOutput[i] += GMath::gaussian((pOutput[j] - pOutput[i]) / dBlurAmount);	}}int CountLocalMaximums(int nElements, double* pData){	if(nElements < 2)		return nElements;	int nCount = 0;	if(pData[0] > pData[1])		nCount++;	int i;	nElements--;	for(i = 1; i < nElements; i++)	{		if(pData[i] > pData[i - 1] && pData[i] > pData[i + 1])			nCount++;	}	if(pData[nElements] > pData[nElements - 1])		nCount++;	return nCount;}*/

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?