gcluster.cpp
来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 401 行
CPP
401 行
/* Copyright (C) 2006, Mike Gashler This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. see http://www.gnu.org/copyleft/lesser.html*/#include "GCluster.h"#include "GArray.h"#include "GArff.h"#include "GKNN.h"#include "GBitTable.h"#include "GPointerQueue.h"#include "GMath.h"#include "GVector.h"#include <math.h>#include <stdlib.h>GMergeClusterer::GMergeClusterer(GArffRelation* pRelation, GArffData* pData, int nClusters){ m_pRelation = pRelation; m_pData = pData; m_nClusters = nClusters; m_pClusters = new int[pData->GetSize()]; int i; for(i = 0; i < pData->GetSize(); i++) m_pClusters[i] = i; int nClusterCount = pData->GetSize(); int a, b, max, min; while(nClusterCount > nClusters) { if(!FindBestMatch(&a, &b)) { GAssert(false, "couldn't find any candidates to merge"); break; } min = MIN(a, b); max = MAX(a, b); ChangeCluster(max, min); if(max != nClusterCount - 1) ChangeCluster(nClusterCount - 1, max); nClusterCount--; }}GMergeClusterer::~GMergeClusterer(){ delete[] m_pClusters;}void GMergeClusterer::ChangeCluster(int nFrom, int nTo){ int i; for(i = 0; i < m_pData->GetSize(); i++) { if(m_pClusters[i] == nFrom) m_pClusters[i] = nTo; }}bool GMergeClusterer::FindBestMatch(int* pA, int* pB){ double dBestDist = 1e200; double d; double* pVecA; double* pVecB; int i, j; for(i = 0; i < m_pData->GetSize(); i++) { pVecA = m_pData->GetVector(i); for(j = i + 1; j < m_pData->GetSize(); j++) { if(m_pClusters[i] == m_pClusters[j]) continue; pVecB = m_pData->GetVector(j); d = m_pRelation->ComputeInputDistanceSquared(pVecA, pVecB); if(d < dBestDist) { dBestDist = d; *pA = m_pClusters[i]; *pB = m_pClusters[j]; } } } return (dBestDist < 1e200);}int GMergeClusterer::GetCluster(int nVector){ GAssert(nVector >= 0 && nVector < m_pData->GetSize(), "out of range"); return m_pClusters[nVector];}// -----------------------------------------------------------------------------------------GKMeans::GKMeans(GArffRelation* pRelation, GArffData* pData, int nClusters){ if(pData->GetSize() < nClusters) throw "The number of clusters must be less than the number of data points"; m_pRelation = pRelation; m_pData = pData; m_nClusters = nClusters; m_pClusters = new int[pData->GetSize()]; memset(m_pClusters, 0xff, sizeof(int) * pData->GetSize()); int i; for(i = 0; i < 5; i++) { if(Cluster(pRelation->GetInputCount() * pData->GetSize())) break; } GAssert(i < 5, "never fully converged");}GKMeans::~GKMeans(){ delete[] m_pClusters;}bool GKMeans::SelectSeeds(GArffData* pSeeds){ // Randomly select the seed points int i, j, k, index; double* pVector; for(i = 0; i < m_nClusters; i++) { for(j = 0; j < m_nClusters; j++) { // Pick a point index = rand() % m_pData->GetSize(); pVector = m_pData->GetVector(index); // Make sure we didn't pick the same point already bool bOK = true; for(k = 0; k < i; k++) { if(m_pRelation->ComputeInputDistanceSquared(pVector, pSeeds->GetVector(k)) <= 0) { bOK = false; break; } } // Accept this seed point if(bOK) { pSeeds->CopyVector(pVector, m_pRelation->GetAttributeCount()); break; } } if(j >= m_nClusters) return false; // failed to find "m_nClusters" unique seed points } return true;}bool GKMeans::Cluster(int nMaxIterations){ // Pick the seeds GArffData means(m_nClusters); if(!SelectSeeds(&means)) return false; GAssert(means.GetSize() == m_nClusters, "Got the wrong number of seed points"); // Do the clustering GTEMPBUF(int, pCounts, means.GetSize()); int i, j, k, nCluster; double d, dBestDist; double* pVector; bool bChanged; for(i = 0; i < nMaxIterations; i++) { // Assign new cluster to each point bChanged = false; for(j = 0; j < m_pData->GetSize(); j++) { pVector = m_pData->GetVector(j); dBestDist = 1e200; nCluster = -1; for(k = 0; k < m_nClusters; k++) { d = m_pRelation->ComputeInputDistanceSquared(pVector, means.GetVector(k)); if(d < dBestDist) { dBestDist = d; nCluster = k; } } if(m_pClusters[j] != nCluster) bChanged = true; m_pClusters[j] = nCluster; } if(!bChanged) break; // Update the seeds for(j = 0; j < means.GetSize(); j++) GVector::SetToZero(means.GetVector(j), m_pRelation->GetAttributeCount()); memset(pCounts, '\0', sizeof(int) * means.GetSize()); for(j = 0; j < m_pData->GetSize(); j++) { GVector::Add(means.GetVector(m_pClusters[j]), m_pData->GetVector(j), m_pRelation->GetAttributeCount()); pCounts[m_pClusters[j]]++; } for(j = 0; j < means.GetSize(); j++) GVector::Multiply(means.GetVector(j), 1.0 / pCounts[j], m_pRelation->GetAttributeCount()); } return (i < nMaxIterations);}int GKMeans::GetCluster(int nVector){ GAssert(nVector >= 0 && nVector < m_pData->GetSize(), "out of range"); return m_pClusters[nVector];}// -----------------------------------------------------------------------------------------GNeighborClusterer::GNeighborClusterer(int nVectorCount){ m_pClusters = new GIntArray(8); m_pVectors = new GIntArray(nVectorCount);}GNeighborClusterer::~GNeighborClusterer(){ delete(m_pClusters); delete(m_pVectors);}// staticGNeighborClusterer* GNeighborClusterer::Cluster(GArffRelation* pRelation, GArffData* pData, int nNeighbors){ GNeighborFinder neighborFinder(pRelation, pData, MAX(8, nNeighbors)); int nVectors = pData->GetSize(); GNeighborClusterer* pClusterer = new GNeighborClusterer(nVectors); GBitTable bits(nVectors); GTEMPBUF(int, pNeighbors, nNeighbors); GTEMPBUF(double, pDistances, nNeighbors); int nFirstAvailable = 0; int i, j; double* pVector; while(true) { while(nFirstAvailable < nVectors && bits.GetBit(nFirstAvailable)) nFirstAvailable++; if(nFirstAvailable >= nVectors) break; pClusterer->m_pClusters->AddInt(pClusterer->m_pVectors->GetSize()); GIntQueue q; q.Push(nFirstAvailable); while(q.GetSize() > 0) { i = q.Pop(); if(bits.GetBit(i)) continue; pClusterer->m_pVectors->AddInt(i); bits.SetBit(i, true); pVector = pData->GetVector(i); neighborFinder.FindNeighbors(pNeighbors, pDistances, nNeighbors, pVector, i); for(j = 0; j < nNeighbors; j++) q.Push(pNeighbors[j]); } } return pClusterer;}int GNeighborClusterer::GetClusterCount(){ return m_pClusters->GetSize();}int GNeighborClusterer::GetClusterSize(int nCluster){ if(nCluster < m_pClusters->GetSize() - 1) return m_pClusters->GetInt(nCluster + 1) - m_pClusters->GetInt(nCluster); else return m_pVectors->GetSize() - m_pClusters->GetInt(nCluster);}int GNeighborClusterer::GetVectorIndex(int nCluster, int nVector){ return m_pVectors->GetInt(m_pClusters->GetInt(nCluster) + nVector);}#ifndef NO_TEST_CODE//staticvoid GNeighborClusterer::Test(){ // Make a 3D data set with 3 entwined spirals GArffRelation rel; rel.AddAttribute(new GArffAttribute(true, 0, NULL)); rel.AddAttribute(new GArffAttribute(true, 0, NULL)); rel.AddAttribute(new GArffAttribute(true, 0, NULL)); GArffData data(3000); double dThirdCircle = PI * 2 / 3; double* pVector; int i; for(i = 0; i < 1000; i++) { pVector = new double[3]; pVector[0] = cos((double)i * 4 * PI / 1000); pVector[1] = sin((double)i * 4 * PI / 1000); pVector[2] = (double)i / 1000; data.AddVector(pVector); pVector = new double[3]; pVector[0] = cos((double)i * 4 * PI / 1000 + dThirdCircle); pVector[1] = sin((double)i * 4 * PI / 1000 + dThirdCircle); pVector[2] = (double)i / 1000; data.AddVector(pVector); pVector = new double[3]; pVector[0] = cos((double)i * 4 * PI / 1000 + dThirdCircle + dThirdCircle); pVector[1] = sin((double)i * 4 * PI / 1000 + dThirdCircle + dThirdCircle); pVector[2] = (double)i / 1000; data.AddVector(pVector); } // Cluster them GNeighborClusterer* pClusterer = GNeighborClusterer::Cluster(&rel, &data, 10); Holder<GNeighborClusterer*> hClusterer(pClusterer); // Check the answers if(pClusterer->GetClusterCount() != 3) throw "wrong number of clusters"; int nClusterSize = pClusterer->GetClusterSize(0); if(pClusterer->GetClusterSize(1) != nClusterSize) throw "clusters different sizes"; if(pClusterer->GetClusterSize(2) != nClusterSize) throw "clusters different sizes"; for(i = 0; i < nClusterSize; i++) { if(pClusterer->GetVectorIndex(0, i) % 3 != 0) throw "vector in wrong cluster"; if(pClusterer->GetVectorIndex(1, i) % 3 != 1) throw "vector in wrong cluster"; if(pClusterer->GetVectorIndex(2, i) % 3 != 2) throw "vector in wrong cluster"; }}#endif // NO_TEST_CODE// -----------------------------------------------------------------------------------------/*void BlurVector(int nDims, double* pInput, double* pOutput, double dAmount){ double dWeight, dSumWeight; int i, j; for(i = 0; i < nDims; i++) { pOutput[i] = 0; dSumWeight = 0; for(j = 0; j < nDims; j++) { dWeight = GMath::gaussian((double)(j - i) / dAmount); dSumWeight += dWeight; pOutput[i] += dWeight * pInput[j]; } pOutput[i] /= dSumWeight; }}*//*void MakeGaussianHistogram(int nElements, double* pInput, double* pOutput, double dBlurAmount){ int i, j; for(i = 0; i < nElements; i++) { pOutput[i] = 0; for(j = 0; j < nElements; j++) pOutput[i] += GMath::gaussian((pOutput[j] - pOutput[i]) / dBlurAmount); }}int CountLocalMaximums(int nElements, double* pData){ if(nElements < 2) return nElements; int nCount = 0; if(pData[0] > pData[1]) nCount++; int i; nElements--; for(i = 1; i < nElements; i++) { if(pData[i] > pData[i - 1] && pData[i] > pData[i + 1]) nCount++; } if(pData[nElements] > pData[nElements - 1]) nCount++; return nCount;}*/
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?