📄 gmanifold.cpp
字号:
#include "GManifold.h"#include <stdio.h>#include "GArray.h"#include <math.h>#include "GPointerQueue.h"#include "GArff.h"#include "GMatrix.h"#include "GVector.h"#include "GBits.h"#include "GKNN.h"struct GManifoldSculptingNeighbor{ unsigned char* m_pNeighbor; unsigned char* m_pNeighborsNeighbor; double m_dCosTheta; double m_dDistance;};struct GManifoldSculptingMetaData{ int m_nCycle; bool m_bAdjustable;};GManifoldSculpting::GManifoldSculpting(int nDataPoints, int nDimensions, int nNeighbors){ m_nDataPoints = nDataPoints; m_nDimensions = nDimensions; m_nNeighbors = nNeighbors; m_nDataIndex = sizeof(struct GManifoldSculptingNeighbor) * m_nNeighbors; m_nValueIndex = m_nDataIndex + sizeof(struct GManifoldSculptingMetaData); m_nRecordSize = m_nValueIndex + sizeof(double) * m_nDimensions; m_pData = new unsigned char[m_nRecordSize * nDataPoints]; m_dSquishingRate = .99; m_nTargetDimensions = 0; m_nPass = 0; m_dAveNeighborDist = 0; m_pQ = NULL; m_nSmoothingAdvantage = 10;}GManifoldSculpting::~GManifoldSculpting(){ delete(m_pData); delete(m_pQ);}void GManifoldSculpting::SetDataPoint(int n, double* pValues, bool bAdjustable){ memcpy(&m_pData[n * m_nRecordSize + m_nValueIndex], pValues, sizeof(double) * m_nDimensions); struct GManifoldSculptingMetaData* pData = GetMetaData(n); pData->m_bAdjustable = bAdjustable;}void GManifoldSculpting::SetData(GArffRelation* pRelation, GArffData* pData){ GAssert(pData->GetSize() == m_nDataPoints, "wrong number of data points"); int nInputs = pRelation->GetInputCount(); GTEMPBUF(double, pInputs, nInputs); double* pRow; int n, i; for(n = 0; n < m_nDataPoints; n++) { pRow = pData->GetVector(n); for(i = 0; i < nInputs; i++) pInputs[i] = pRow[pRelation->GetInputIndex(i)]; SetDataPoint(n, pInputs, true); }}double* GManifoldSculpting::GetDataPoint(int n){ return (double*)&m_pData[n * m_nRecordSize + m_nValueIndex];}int GManifoldSculpting::DataPointSortCompare(unsigned char* pA, unsigned char* pB){ pA += m_nValueIndex; pB += m_nValueIndex; double dA = ((double*)pA)[m_nCurrentDimension]; double dB = ((double*)pB)[m_nCurrentDimension]; if(dA > dB) return 1; else if(dA < dB) return -1; return 0;}int GManifoldSculpting_DataPointSortCompare(void* pThis, void* pA, void* pB){ return ((GManifoldSculpting*)pThis)->DataPointSortCompare((unsigned char*)pA, (unsigned char*)pB);}int GManifoldSculpting::FindMostDistantNeighbor(struct GManifoldSculptingNeighbor* pNeighbors){ int nMostDistant = 0; int n; for(n = 1; n < m_nNeighbors; n++) { if(pNeighbors[n].m_dDistance > pNeighbors[nMostDistant].m_dDistance) nMostDistant = n; } return nMostDistant;}double GManifoldSculpting::CalculateDistance(unsigned char* pA, unsigned char* pB){ pA += m_nValueIndex; pB += m_nValueIndex; double* pdA = (double*)pA; double* pdB = (double*)pB; double dTmp; double dSum = 0; int n; for(n = 0; n < m_nDimensions; n++) { dTmp = pdB[n] - pdA[n]; dSum += (dTmp * dTmp); } return sqrt(dSum);}double GManifoldSculpting::CalculateVectorCorrelation(unsigned char* pA, unsigned char* pVertex, unsigned char* pB){ pA += m_nValueIndex; pVertex += m_nValueIndex; pB += m_nValueIndex; double* pdA = (double*)pA; double* pdV = (double*)pVertex; double* pdB = (double*)pB; double dDotProd = 0; double dMagA = 0; double dMagB = 0; double dA, dB; int n; for(n = 0; n < m_nDimensions; n++) { dA = pdA[n] - pdV[n]; dB = pdB[n] - pdV[n]; dDotProd += (dA * dB); dMagA += (dA * dA); dMagB += (dB * dB); } if(dDotProd == 0) return 0; double dCorrelation = dDotProd / (sqrt(dMagA) * sqrt(dMagB)); GAssert(dCorrelation > -2 && dCorrelation < 2, "out of range"); return dCorrelation;}void GManifoldSculpting::CalculateMetadata(int nTargetDimensions){ // Calculate how far we need to look to have a good chance of finding the // nearest neighbors int nDimSpan = (int)(pow((double)m_nDataPoints, (double)1 / nTargetDimensions) * 2); if(nDimSpan < m_nNeighbors * 2) nDimSpan = m_nNeighbors * 2; // Find the nearest neighbors GPointerArray arr(m_nDataPoints); int n, i, j; for(n = 0; n < m_nDataPoints; n++) arr.AddPointer(&m_pData[n * m_nRecordSize]); // Initialize everybody's neighbors struct GManifoldSculptingNeighbor* pNeighbors; for(n = 0; n < m_nDataPoints; n++) { pNeighbors = (struct GManifoldSculptingNeighbor*)arr.GetPointer(n); for(i = 0; i < m_nNeighbors; i++) { pNeighbors[i].m_pNeighbor = NULL; pNeighbors[i].m_pNeighborsNeighbor = NULL; pNeighbors[i].m_dDistance = 1e100; } } // Find the nearest neighbors int nMostDistantNeighbor; int nStart, nEnd; double dDistance; unsigned char* pCandidate; for(m_nCurrentDimension = 0; m_nCurrentDimension < m_nDimensions; m_nCurrentDimension++) { // Sort on the current dimension arr.Sort(GManifoldSculpting_DataPointSortCompare, this); // Do a pass in this dimension for every data point to search for nearest neighbors for(n = 0; n < m_nDataPoints; n++) { // Check all the data points that are close in this dimension pNeighbors = (struct GManifoldSculptingNeighbor*)arr.GetPointer(n); nMostDistantNeighbor = FindMostDistantNeighbor(pNeighbors); nStart = MAX(0, n - nDimSpan); nEnd = MIN(m_nDataPoints - 1, n + nDimSpan); for(i = nStart; i <= nEnd; i++) { if(i == n) continue; pCandidate = (unsigned char*)arr.GetPointer(i); dDistance = CalculateDistance((unsigned char*)pNeighbors, pCandidate); if(dDistance < pNeighbors[nMostDistantNeighbor].m_dDistance) { // Check to see if this is already a neighbor for(j = 0; j < m_nNeighbors; j++) { if(pNeighbors[j].m_pNeighbor == pCandidate) break; } if(j == m_nNeighbors) { // Make this a neighbor pNeighbors[nMostDistantNeighbor].m_pNeighbor = pCandidate; pNeighbors[nMostDistantNeighbor].m_dDistance = dDistance; nMostDistantNeighbor = FindMostDistantNeighbor(pNeighbors); } } } } } // For each data point, find the most co-linear of each neighbor's neighbors m_dAveNeighborDist = 0; struct GManifoldSculptingMetaData* pData; struct GManifoldSculptingNeighbor* pNeighborsNeighbors; double dCosTheta; for(n = 0; n < m_nDataPoints; n++) { pNeighbors = (struct GManifoldSculptingNeighbor*)arr.GetPointer(n); pData = GetMetaData(pNeighbors); pData->m_nCycle = -1; for(i = 0; i < m_nNeighbors; i++) { m_dAveNeighborDist += pNeighbors[i].m_dDistance; pNeighborsNeighbors = (struct GManifoldSculptingNeighbor*)pNeighbors[i].m_pNeighbor; pCandidate = pNeighborsNeighbors[0].m_pNeighbor; dCosTheta = CalculateVectorCorrelation((unsigned char*)pNeighbors, (unsigned char*)pNeighborsNeighbors, pCandidate); pNeighbors[i].m_dCosTheta = dCosTheta; pNeighbors[i].m_pNeighborsNeighbor = pCandidate; for(j = 1; j < m_nNeighbors; j++) { pCandidate = pNeighborsNeighbors[j].m_pNeighbor; dCosTheta = CalculateVectorCorrelation((unsigned char*)pNeighbors, (unsigned char*)pNeighborsNeighbors, pCandidate); if(dCosTheta < pNeighbors[i].m_dCosTheta) { pNeighbors[i].m_dCosTheta = dCosTheta; pNeighbors[i].m_pNeighborsNeighbor = pCandidate; } } } } m_dAveNeighborDist /= (m_nDataPoints * m_nNeighbors); m_dLearningRate = m_dAveNeighborDist * 10;}double GManifoldSculpting::CalculateDataPointError(unsigned char* pDataPoint){ double dError = 0; double dDist; double dTheta; struct GManifoldSculptingNeighbor* pNeighbors = (struct GManifoldSculptingNeighbor*)pDataPoint; struct GManifoldSculptingMetaData* pDataPointData = (struct GManifoldSculptingMetaData*)(pDataPoint + m_nDataIndex); struct GManifoldSculptingMetaData* pNeighborData; int n; for(n = 0; n < m_nNeighbors; n++) { dDist = CalculateDistance(pDataPoint, pNeighbors[n].m_pNeighbor); dDist -= pNeighbors[n].m_dDistance; dDist /= MAX(m_dAveNeighborDist, 1e-10); dDist *= dDist; dTheta = CalculateVectorCorrelation(pDataPoint, pNeighbors[n].m_pNeighbor, pNeighbors[n].m_pNeighborsNeighbor); dTheta -= pNeighbors[n].m_dCosTheta; dTheta *= dTheta; dTheta = MAX((double)0, dTheta - .01); pNeighborData = (struct GManifoldSculptingMetaData*)(pNeighbors[n].m_pNeighbor + m_nDataIndex); if(pNeighborData->m_nCycle != pDataPointData->m_nCycle && pNeighborData->m_bAdjustable) { dDist /= m_nSmoothingAdvantage; dTheta /= m_nSmoothingAdvantage; } dError += (2 * dDist + dTheta); } return dError * dError;}int GManifoldSculpting::AjustDataPoint(unsigned char* pDataPoint, int nTargetDimensions, double* pError){ bool bMadeProgress = true; double* pValues = (double*)(pDataPoint + m_nValueIndex); double dErrorBase = CalculateDataPointError(pDataPoint); double dError = 0; double dStepSize = m_dLearningRate * (GBits::GetRandomDouble() * .1 + .9); // We multiply the learning rate by a random value so that the points can get away from each other int n, nSteps; for(nSteps = 0; bMadeProgress; nSteps++) { bMadeProgress = false; for(n = 0; n < nTargetDimensions; n++) { pValues[n] += dStepSize; dError = CalculateDataPointError(pDataPoint); if(dError >= dErrorBase) { pValues[n] -= (dStepSize + dStepSize); dError = CalculateDataPointError(pDataPoint); } if(dError >= dErrorBase) pValues[n] += dStepSize; else { dErrorBase = dError; bMadeProgress = true; } } } *pError = dError; return nSteps - 1; // the -1 is to undo the last incrementor}void GManifoldSculpting::SquishBegin(int nTargetDimensions){ GAssert(nTargetDimensions > 0 && nTargetDimensions < m_nDimensions, "out of range"); m_nTargetDimensions = nTargetDimensions; m_nPass = 0; // Calculate metadata CalculateMetadata(nTargetDimensions); // Make the queue m_pQ = new GPointerQueue();}double GManifoldSculpting::SquishPass(int nSeedDataPoint){ double* pValues; struct GManifoldSculptingMetaData* pData; unsigned char* pDataPoint; struct GManifoldSculptingNeighbor* pNeighbors; int n, i; // Squish the extra dimensions for(n = 0; n < m_nDataPoints; n++) { pValues = (double*)(&m_pData[n * m_nRecordSize] + m_nValueIndex); for(i = m_nTargetDimensions; i < m_nDimensions; i++) pValues[i] *= m_dSquishingRate; } // Start at the seed point and correct outward in a breadth-first mannner m_pQ->Push(&m_pData[nSeedDataPoint * m_nRecordSize]); int nVisitedNodes = 0; int nSteps = 0; double dError = 0; double dTotalError = 0; while(m_pQ->GetSize() > 0) { // Check if this one has already been done pDataPoint = (unsigned char*)m_pQ->Pop(); pData = (struct GManifoldSculptingMetaData*)(pDataPoint + m_nDataIndex); if(pData->m_nCycle == m_nPass) continue; pData->m_nCycle = m_nPass; nVisitedNodes++; // Push all neighbors into the queue pNeighbors = (struct GManifoldSculptingNeighbor*)pDataPoint; for(n = 0; n < m_nNeighbors; n++) m_pQ->Push(pNeighbors[n].m_pNeighbor); // Ajust this data point if(pData->m_bAdjustable) { nSteps += AjustDataPoint(pDataPoint, m_nTargetDimensions, &dError); dTotalError += dError; } } //GAssert(nVisitedNodes * 1.25 > m_nDataPoints, "manifold appears poorly sampled"); if(nSteps * 3 < m_nDataPoints) m_dLearningRate *= .9; else if(nSteps > m_nDataPoints * 9) m_dLearningRate /= .9;// printf("[Learning Rate: %f]", m_dLearningRate); m_nPass++; return dTotalError;}/*static*/ GArffData* GManifoldSculpting::DoManifoldSculpting(PreProcessType ePreProcess, GArffRelation* pRelation, GArffData* pData, int nTargetDimensions, int nNeighbors, int nPreludeIterations, int nIterationsSinceBest){ // Make the squisher int nDimensions = pRelation->GetInputCount(); int nDataPoints = pData->GetSize(); GManifoldSculpting squisher(nDataPoints, nDimensions, nNeighbors); squisher.SetData(pRelation, pData); // Initialize and pre-process squisher.SquishBegin(nTargetDimensions); if(ePreProcess == PP_NONE) { } else if(ePreProcess == PP_PCA) { GVector eigenValues; GArffData* pPreprocessedData = GPCA::DoPCA(pRelation, pData, &eigenValues); Holder<GArffData*> hPreprocessedData(pPreprocessedData); squisher.SetData(pRelation, pPreprocessedData); } else if(ePreProcess == PP_LLE) { // LLE seems to require fewer neighbors than ManifoldSculpting. I'm // not sure what the best ratio is. It should probably be another input // parameter, but for now I'll just hack it. int nLLENeighbors = MAX(8, nNeighbors / 2); GArffData* pPreprocessedData = GLLE::DoLLE(pRelation, pData, nLLENeighbors); Holder<GArffData*> hPreprocessedData(pPreprocessedData); squisher.SetData(pRelation, pPreprocessedData); } else { GAssert(false, "Unrecognized pre-processing algorithm"); } // Do the squishing passes double d; double dBestError = 0; int n; for(n = 0; n < nPreludeIterations; n++) dBestError = squisher.SquishPass(rand() % nDataPoints); for(n = 0; n < nIterationsSinceBest; n++) { d = squisher.SquishPass(rand() % nDataPoints); if(d < dBestError) { dBestError = d; n = 0; } } // Convert data to an ARFF file int nAttributeCount = pRelation->GetAttributeCount(); int nOutputs = pRelation->GetOutputCount(); GArffData* pDataOut = new GArffData(nDataPoints); double* pRowIn; double* pRowOut; int i, nIndex; for(n = 0; n < nDataPoints; n++) { pRowIn = pData->GetVector(n); pRowOut = new double[nAttributeCount]; pDataOut->AddVector(pRowOut); // Copy the output values straight over for(i = 0; i < nOutputs; i++) { nIndex = pRelation->GetOutputIndex(i); pRowOut[nIndex] = pRowIn[nIndex]; } // Copy the squished input values pRowIn = squisher.GetDataPoint(n); for(i = 0; i < nDimensions; i++) { nIndex = pRelation->GetInputIndex(i); pRowOut[nIndex] = pRowIn[i];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -