gmanifold.cpp
来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 1,591 行 · 第 1/3 页
CPP
1,591 行
pSquaredDistances[i] = pPoint->GetSquaredDist(); } else { pNeighbors[i] = -1; pSquaredDistances[i] = 1e200; } delete(pPoint); } while(m_pPriorityQueue->GetSize() > 0) delete(m_pPriorityQueue->Unlink(0)); } int CompareDims(int a, int b) { if(m_pPrincipleComponent[a] > m_pPrincipleComponent[b]) return -1; else return 1; }#ifndef NO_TEST_CODE static void Test() { // Generate the data int nDimensions = 10; int nNeighbors = 20; int nPoints = 500; int i, j, k; GArffData data(nPoints); double* pVector; for(i = 0; i < nPoints; i++) { pVector = new double[nDimensions]; for(j = 0; j < nDimensions; j++) pVector[j] = GBits::GetRandomDouble() * 100; data.AddVector(pVector); } // Find neighbors double* pVector2; int* pNeighbors = new int[nNeighbors]; ArrayHolder<int*> hNeighbors(pNeighbors); double* pDistances = new double[nNeighbors]; ArrayHolder<double*> hDistances(pDistances); GYetAnotherNeighborFinder gnf(nDimensions, &data); for(i = 0; i < nPoints; i++) { // Find the neighbors pVector = data.GetVector(i); gnf.FindNeighbors(pNeighbors, pDistances, nNeighbors, pVector, i); // Check the answer int nWorstNeighbor = -1; double dWorstDistance = 0; double d; for(j = 0; j < nNeighbors; j++) { if(pNeighbors[j] < 0) throw "Didn't retrieve enough neighbors"; d = GVector::ComputeSquaredDistance(pVector, data.GetVector(pNeighbors[j]), nDimensions); if(d > dWorstDistance) { dWorstDistance = d; nWorstNeighbor = j; } } for(j = 0; j < nPoints; j++) { pVector2 = data.GetVector(j); d = GVector::ComputeSquaredDistance(pVector, pVector2, nDimensions); if(d == 0) { } else if(d < dWorstDistance) { for(k = 0; k < nNeighbors; k++) { if(k > 0) { // This isn't a complete check, but it should be good enough if(pNeighbors[k] == pNeighbors[k - 1]) throw "same neighbor twice"; if(pNeighbors[k] == pNeighbors[0]) throw "same neighbor twice"; } if(pNeighbors[k] == i) throw "failed to exclude itself"; if(data.GetVector(pNeighbors[k]) == pVector2) break; } if(k >= nNeighbors) throw "missed a neighbor"; } } } }#endif // !NO_TEST_CODE};int GYetAnotherNeighborFinderDimSorter(void* pThis, int a, int b){ return ((GYetAnotherNeighborFinder*)pThis)->CompareDims(a, b);}GYetAnotherNeighborFinder::GYetAnotherNeighborFinder(int nDims, GArffData* pData) : m_dimOrder(nDims){ m_pPriorityQueue = new GAVLTree(); m_nDims = nDims; m_pData = pData; m_pPrincipleComponent = new double[nDims]; pData->ComputePrincipleComponent(nDims, m_pPrincipleComponent, 10); int i; for(i = 0; i < nDims; i++) m_dimOrder.AddInt(i); m_dimOrder.Sort(GYetAnotherNeighborFinderDimSorter, this);}*/// --------------------------------------------------------------------struct GManifoldSculptingNeighbor{ int m_nNeighbor; int m_nNeighborsNeighbor; double m_dCosTheta; double m_dDistance;};struct GManifoldSculptingMetaData{ int m_nCycle; bool m_bAdjustable;};GManifoldSculpting::GManifoldSculpting(int nDataPoints, int nDimensions, int nNeighbors){ m_nDataPoints = nDataPoints; m_nDimensions = nDimensions; m_nNeighbors = nNeighbors; m_nDataIndex = sizeof(struct GManifoldSculptingNeighbor) * m_nNeighbors; m_nValueIndex = m_nDataIndex + sizeof(struct GManifoldSculptingMetaData); m_nRecordSize = m_nValueIndex + sizeof(double) * m_nDimensions; m_pData = new unsigned char[m_nRecordSize * nDataPoints]; m_dSquishingRate = .99; m_nTargetDimensions = 0; m_nPass = 0; m_dAveNeighborDist = 0; m_pQ = NULL; m_nSmoothingAdvantage = 10;}GManifoldSculpting::~GManifoldSculpting(){ delete(m_pData); delete(m_pQ);}void GManifoldSculpting::SetVector(int n, double* pValues, bool bAdjustable){ memcpy(GetVector(n), pValues, sizeof(double) * m_nDimensions); struct GManifoldSculptingMetaData* pData = GetMetaData(n); pData->m_bAdjustable = bAdjustable;}void GManifoldSculpting::SetData(GArffRelation* pRelation, GArffData* pData){ GAssert(pData->GetSize() == m_nDataPoints, "wrong number of data points"); int nInputs = pRelation->GetInputCount(); GTEMPBUF(double, pInputs, nInputs); double* pRow; int n, i; for(n = 0; n < m_nDataPoints; n++) { pRow = pData->GetVector(n); for(i = 0; i < nInputs; i++) pInputs[i] = pRow[pRelation->GetInputIndex(i)]; SetVector(n, pInputs, true); }}int GManifoldSculpting::FindMostDistantNeighbor(struct GManifoldSculptingNeighbor* pNeighbors){ int nMostDistant = 0; int n; for(n = 1; n < m_nNeighbors; n++) { if(pNeighbors[n].m_dDistance > pNeighbors[nMostDistant].m_dDistance) nMostDistant = n; } return nMostDistant;}int GManifoldSculpting::CountShortcuts(int nThreshold){ int nShortcuts = 0; int n, i; for(n = 0; n < m_nDataPoints; n++) { struct GManifoldSculptingNeighbor* pPoint = GetRecord(n); for(i = 0; i < m_nNeighbors; i++) { if(ABS(pPoint[i].m_nNeighbor - n) >= nThreshold) { printf("shortcut: %d, %d\n", n, pPoint[i].m_nNeighbor); nShortcuts++; } } } return nShortcuts;}double GManifoldSculpting::CalculateDistance(int nPoint1, int nPoint2){ double* pdA = GetVector(nPoint1); double* pdB = GetVector(nPoint2); double dTmp; double dSum = 0; int n; for(n = 0; n < m_nDimensions; n++) { dTmp = pdB[n] - pdA[n]; dSum += (dTmp * dTmp); } return sqrt(dSum);}double GManifoldSculpting::CalculateVectorCorrelation(int a, int vertex, int b){ double* pdA = GetVector(a); double* pdV = GetVector(vertex); double* pdB = GetVector(b); double dDotProd = 0; double dMagA = 0; double dMagB = 0; double dA, dB; int n; for(n = 0; n < m_nDimensions; n++) { dA = pdA[n] - pdV[n]; dB = pdB[n] - pdV[n]; dDotProd += (dA * dB); dMagA += (dA * dA); dMagB += (dB * dB); }#ifdef DOT_PRODUCT_ONLY return dDotProd;#else // DOT_PRODUCT_ONLY if(dDotProd == 0) return 0; double dCorrelation = dDotProd / (sqrt(dMagA) * sqrt(dMagB)); GAssert(dCorrelation > -2 && dCorrelation < 2, "out of range"); return dCorrelation;#endif // !DOT_PRODUCT_ONLY}void GManifoldSculpting::CalculateMetadata(int nTargetDimensions){ int i, j; { // Make a temporary GArffRelation and GArffData to use with GNeighborFinder GArffRelation relation; for(i = 0; i < m_nDimensions; i++) relation.AddAttribute(new GArffAttribute(true, 0, NULL)); GArffData data(m_nDataPoints); for(i = 0; i < m_nDataPoints; i++) data.AddVector(GetVector(i)); // Find the neighbors //GManifoldNeighborFinder neighborFinder(&relation, &data, MAX(m_nNeighbors, 8), GManifoldNeighborFinder::principle_component_8); //GYetAnotherNeighborFinder neighborFinder(m_nDimensions, &data); GNeighborFinder neighborFinder(&relation, &data, MAX(m_nNeighbors, 8)); GTEMPBUF(int, pNeighbors, m_nNeighbors); GTEMPBUF(double, pSquaredDistances, m_nNeighbors); for(i = 0; i < m_nDataPoints; i++) { neighborFinder.FindNeighbors(pNeighbors, pSquaredDistances, m_nNeighbors, GetVector(i), i); struct GManifoldSculptingNeighbor* pArrNeighbors = GetRecord(i); for(j = 0; j < m_nNeighbors; j++) { GAssert(pNeighbors[j] >= 0, "failed to find 'k' neighbors"); pArrNeighbors[j].m_nNeighbor = pNeighbors[j]; pArrNeighbors[j].m_nNeighborsNeighbor = -1; pArrNeighbors[j].m_dDistance = sqrt(pSquaredDistances[j]); } } data.DropAllVectors(); } // For each data point, find the most co-linear of each neighbor's neighbors m_dAveNeighborDist = 0; int n, nVertex, nCandidate; struct GManifoldSculptingNeighbor* pPoint; struct GManifoldSculptingNeighbor* pVertex; double dCosTheta; for(n = 0; n < m_nDataPoints; n++) { pPoint = GetRecord(n); GetMetaData(pPoint)->m_nCycle = -1; for(i = 0; i < m_nNeighbors; i++) { m_dAveNeighborDist += pPoint[i].m_dDistance; nVertex = pPoint[i].m_nNeighbor; pVertex = GetRecord(nVertex); nCandidate = pVertex[0].m_nNeighbor; dCosTheta = CalculateVectorCorrelation(n, nVertex, nCandidate); pPoint[i].m_dCosTheta = dCosTheta; pPoint[i].m_nNeighborsNeighbor = nCandidate; for(j = 1; j < m_nNeighbors; j++) { nCandidate = pVertex[j].m_nNeighbor; dCosTheta = CalculateVectorCorrelation(n, nVertex, nCandidate); if(dCosTheta < pPoint[i].m_dCosTheta) { pPoint[i].m_dCosTheta = dCosTheta; pPoint[i].m_nNeighborsNeighbor = nCandidate; } }#ifndef DOT_PRODUCT_ONLY if(nTargetDimensions < 2) pPoint[i].m_dCosTheta = GBits::GetSign(pPoint[i].m_dCosTheta);#endif // !DOT_PRODUCT_ONLY } } m_dAveNeighborDist /= (m_nDataPoints * m_nNeighbors); m_dLearningRate = m_dAveNeighborDist * 10;}double GManifoldSculpting::CalculateDataPointError(int nPoint){ double dError = 0; double dDist; double dTheta; struct GManifoldSculptingNeighbor* pPoint = GetRecord(nPoint); struct GManifoldSculptingMetaData* pDataPointData = GetMetaData(pPoint); struct GManifoldSculptingMetaData* pNeighborData; int n, nNeighbor;#ifdef LINEAR_WEIGHTING double dWeight; double dLinearSum = 0;#endif // LINEAR_WEIGHTING for(n = 0; n < m_nNeighbors; n++) { nNeighbor = pPoint[n].m_nNeighbor; dTheta = CalculateVectorCorrelation(nPoint, nNeighbor, pPoint[n].m_nNeighborsNeighbor); dTheta -= pPoint[n].m_dCosTheta; dTheta *= dTheta;#ifdef DOT_PRODUCT_ONLY pNeighborData = GetMetaData(nNeighbor); if(pNeighborData->m_nCycle != pDataPointData->m_nCycle && pNeighborData->m_bAdjustable) dTheta /= m_nSmoothingAdvantage; dError += dTheta;#else // DOT_PRODUCT_ONLY //dTheta = MAX((double)0, dTheta - .01); dDist = CalculateDistance(nPoint, nNeighbor); dDist -= pPoint[n].m_dDistance; dDist /= MAX(m_dAveNeighborDist, 1e-10); dDist *= dDist; pNeighborData = GetMetaData(nNeighbor); if(pNeighborData->m_nCycle != pDataPointData->m_nCycle && pNeighborData->m_bAdjustable) { dDist /= m_nSmoothingAdvantage; dTheta /= m_nSmoothingAdvantage; }# ifdef LINEAR_WEIGHTING dWeight = 1.0 / MAX(1e-9, pPoint[n].m_dDistance); dLinearSum += dWeight; dError += (.5 * dDist + dTheta) * dWeight;# else // LINEAR_WEIGHTING dError += (2 * dDist + dTheta);# endif // !LINEAR_WEIGHTING#endif // !DOT_PRODUCT_ONLY }# ifdef LINEAR_WEIGHTING dError /= dLinearSum;# endif // !LINEAR_WEIGHTING return dError * dError;}int GManifoldSculpting::AdjustDataPoint(int nPoint, int nTargetDimensions, double* pError){ bool bMadeProgress = true; double* pValues = GetVector(nPoint); double dErrorBase = CalculateDataPointError(nPoint); double dError = 0; double dStepSize = m_dLearningRate * (GBits::GetRandomDouble() * .4 + .6); // We multiply the learning rate by a random value so that the points can get away from each other int n, nSteps; for(nSteps = 0; bMadeProgress; nSteps++) { bMadeProgress = false; for(n = 0; n < nTargetDimensions; n++) { pValues[n] += dStepSize; dError = CalculateDataPointError(nPoint); if(dError >= dErrorBase) { pValues[n] -= (dStepSize + dStepSize); dError = CalculateDataPointError(nPoint); } if(dError >= dErrorBase) pValues[n] += dStepSize; else { dErrorBase = dError; bMadeProgress = true; } } } *pError = dError; return nSteps - 1; // the -1 is to undo the last incrementor}void GManifoldSculpting::SquishBegin(int nTargetDimensions){ GAssert(nTargetDimensions > 0 && nTargetDimensions < m_nDimensions, "out of range"); m_nTargetDimensions = nTargetDimensions; m_nPass = 0; // Calculate metadata CalculateMetadata(nTargetDimensions); // Make the queue m_pQ = new GIntQueue();}double GManifoldSculpting::SquishPass(int nSeedDataPoint){ int n, i, nPoint; double* pValues; struct GManifoldSculptingNeighbor* pPoint; struct GManifoldSculptingMetaData* pData; // Squish the extra dimensions for(n = 0; n < m_nDataPoints; n++) { pValues = GetVector(n); for(i = m_nTargetDimensions; i < m_nDimensions; i++) pValues[i] *= m_dSquishingRate; } // Start at the seed point and correct outward in a breadth-first mannner m_pQ->Push(nSeedDataPoint); int nVisitedNodes = 0; int nSteps = 0; double dError = 0; double dTotalError = 0; while(m_pQ->GetSize() > 0) { // Check if this one has already been done nPoint = m_pQ->Pop(); pPoint = GetRecord(nPoint); pData = GetMetaData(pPoint); if(pData->m_nCycle == m_nPass) continue; pData->m_nCycle = m_nPass; nVisitedNodes++; // Push all neighbors into the queue for(n = 0; n < m_nNeighbors; n++) m_pQ->Push(pPoint[n].m_nNeighbor); // Adjust this data point if(pData->m_bAdjustable) { nSteps += AdjustDataPoint(nPoint, m_nTargetDimensions, &dError); dTotalError += dError; } } //GAssert(nVisitedNodes * 1.25 > m_nDataPoints, "manifold appears poorly sampled"); if(nSteps < m_nDataPoints * 2) m_dLearningRate *= .87; else m_dLearningRate /= .91;// printf("[Learning Rate: %f]", m_dLearningRate); m_nPass++; return dTotalError;}/*static*/ GArffData* GManifoldSculpting::DoManifoldSculpting(PreProcessType ePreProcess, GArffRelation* pRelation, GArffData* pData, int nTargetDimensions, int nNeighbors, int nPreludeIterations, int nIterationsSinceBest){ // Make the squisher int nDimensions = pRelation->GetInputCount(); int nDataPoints = pData->GetSize(); GManifoldSculpting squisher(nDataPoints, nDimensions, nNeighbors); squisher.SetData(pRelation, pData); // Initialize and pre-process squisher.SquishBegin(nTargetDimensions); if(ePreProcess == PP_NONE) { } else if(ePreProcess == PP_PCA) { GArffData* pPreprocessedData = GPCA::DoPCA(pRelation, pData); Holder<GArffData*> hPreprocessedData(pPreprocessedData); squisher.SetData(pRelation, pPreprocessedData); } else if(ePreProcess == PP_LLE) { // LLE seems to require fewer neighbors than ManifoldSculpting. I'm // not sure what the best ratio is. It should probably be another input // parameter, but for now I'll just hack it. int nLLENeighbors = MAX(8, nNeighbors / 2); GArffData* pPreprocessedData = GLLE::DoLLE(pRelation, pData, nLLENeighbors); Holder<GArffData*> hPreprocessedData(pPreprocessedData); squisher.SetData(pRelation, pPreprocessedData); } else { GAssert(false, "Unrecognized pre-processing algorithm"); } // Do the squishing passes double d; double dBestError = 0; int n; for(n = 0; n < nPreludeIterations; n++) dBestError = squisher.SquishPass(rand() % nDataPoints); for(n = 0; n < nIterationsSinceBest; n++) { d = squisher.SquishPass(rand() % nDataPoints); if(d < dBestError) { dBestError = d; n = 0; } }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?