gmanifold.cpp

来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 1,591 行 · 第 1/3 页

CPP
1,591
字号
				pSquaredDistances[i] = pPoint->GetSquaredDist();			}			else			{				pNeighbors[i] = -1;				pSquaredDistances[i] = 1e200;			}			delete(pPoint);		}		while(m_pPriorityQueue->GetSize() > 0)			delete(m_pPriorityQueue->Unlink(0));	}	int CompareDims(int a, int b)	{		if(m_pPrincipleComponent[a] > m_pPrincipleComponent[b])			return -1;		else			return 1;	}#ifndef NO_TEST_CODE	static void Test()	{		// Generate the data		int nDimensions = 10;		int nNeighbors = 20;		int nPoints = 500;		int i, j, k;		GArffData data(nPoints);		double* pVector;		for(i = 0; i < nPoints; i++)		{			pVector = new double[nDimensions];			for(j = 0; j < nDimensions; j++)				pVector[j] = GBits::GetRandomDouble() * 100;			data.AddVector(pVector);		}		// Find neighbors		double* pVector2;		int* pNeighbors = new int[nNeighbors];		ArrayHolder<int*> hNeighbors(pNeighbors);		double* pDistances = new double[nNeighbors];		ArrayHolder<double*> hDistances(pDistances);		GYetAnotherNeighborFinder gnf(nDimensions, &data);		for(i = 0; i < nPoints; i++)		{			// Find the neighbors			pVector = data.GetVector(i);			gnf.FindNeighbors(pNeighbors, pDistances, nNeighbors, pVector, i);			// Check the answer			int nWorstNeighbor = -1;			double dWorstDistance = 0;			double d;			for(j = 0; j < nNeighbors; j++)			{				if(pNeighbors[j] < 0)					throw "Didn't retrieve enough neighbors";				d = GVector::ComputeSquaredDistance(pVector, data.GetVector(pNeighbors[j]), nDimensions);				if(d > dWorstDistance)				{					dWorstDistance = d;					nWorstNeighbor = j;				}			}			for(j = 0; j < nPoints; j++)			{				pVector2 = data.GetVector(j);				d = GVector::ComputeSquaredDistance(pVector, pVector2, nDimensions);				if(d == 0)				{				}				else if(d < dWorstDistance)				{					for(k = 0; k < nNeighbors; k++)					{						if(k > 0)						{							// This isn't a complete check, but it should be good enough							if(pNeighbors[k] == pNeighbors[k - 1])								throw "same neighbor twice";							if(pNeighbors[k] == pNeighbors[0])								throw "same neighbor twice";						}						if(pNeighbors[k] == i)							throw "failed to exclude itself";						if(data.GetVector(pNeighbors[k]) == pVector2)							break;					}					if(k >= nNeighbors)						throw "missed a neighbor";				}			}		}	}#endif // !NO_TEST_CODE};int GYetAnotherNeighborFinderDimSorter(void* pThis, int a, int b){	return ((GYetAnotherNeighborFinder*)pThis)->CompareDims(a, b);}GYetAnotherNeighborFinder::GYetAnotherNeighborFinder(int nDims, GArffData* pData)	: m_dimOrder(nDims){	m_pPriorityQueue = new GAVLTree();	m_nDims = nDims;	m_pData = pData;	m_pPrincipleComponent = new double[nDims];	pData->ComputePrincipleComponent(nDims, m_pPrincipleComponent, 10);	int i;	for(i = 0; i < nDims; i++)		m_dimOrder.AddInt(i);	m_dimOrder.Sort(GYetAnotherNeighborFinderDimSorter, this);}*/// --------------------------------------------------------------------struct GManifoldSculptingNeighbor{	int m_nNeighbor;	int m_nNeighborsNeighbor;	double m_dCosTheta;	double m_dDistance;};struct GManifoldSculptingMetaData{	int m_nCycle;	bool m_bAdjustable;};GManifoldSculpting::GManifoldSculpting(int nDataPoints, int nDimensions, int nNeighbors){	m_nDataPoints = nDataPoints;	m_nDimensions = nDimensions;	m_nNeighbors = nNeighbors;	m_nDataIndex = sizeof(struct GManifoldSculptingNeighbor) * m_nNeighbors;	m_nValueIndex = m_nDataIndex + sizeof(struct GManifoldSculptingMetaData);	m_nRecordSize = m_nValueIndex + sizeof(double) * m_nDimensions;	m_pData = new unsigned char[m_nRecordSize * nDataPoints];	m_dSquishingRate = .99;	m_nTargetDimensions = 0;	m_nPass = 0;	m_dAveNeighborDist = 0;	m_pQ = NULL;	m_nSmoothingAdvantage = 10;}GManifoldSculpting::~GManifoldSculpting(){	delete(m_pData);	delete(m_pQ);}void GManifoldSculpting::SetVector(int n, double* pValues, bool bAdjustable){	memcpy(GetVector(n), pValues, sizeof(double) * m_nDimensions);	struct GManifoldSculptingMetaData* pData = GetMetaData(n);	pData->m_bAdjustable = bAdjustable;}void GManifoldSculpting::SetData(GArffRelation* pRelation, GArffData* pData){	GAssert(pData->GetSize() == m_nDataPoints, "wrong number of data points");	int nInputs = pRelation->GetInputCount();	GTEMPBUF(double, pInputs, nInputs);	double* pRow;	int n, i;	for(n = 0; n < m_nDataPoints; n++)	{		pRow = pData->GetVector(n);		for(i = 0; i < nInputs; i++)			pInputs[i] = pRow[pRelation->GetInputIndex(i)];		SetVector(n, pInputs, true);	}}int GManifoldSculpting::FindMostDistantNeighbor(struct GManifoldSculptingNeighbor* pNeighbors){	int nMostDistant = 0;	int n;	for(n = 1; n < m_nNeighbors; n++)	{		if(pNeighbors[n].m_dDistance > pNeighbors[nMostDistant].m_dDistance)			nMostDistant = n;	}	return nMostDistant;}int GManifoldSculpting::CountShortcuts(int nThreshold){	int nShortcuts = 0;	int n, i;	for(n = 0; n < m_nDataPoints; n++)	{		struct GManifoldSculptingNeighbor* pPoint = GetRecord(n);		for(i = 0; i < m_nNeighbors; i++)		{			if(ABS(pPoint[i].m_nNeighbor - n) >= nThreshold)			{				printf("shortcut: %d, %d\n", n, pPoint[i].m_nNeighbor);				nShortcuts++;			}		}	}	return nShortcuts;}double GManifoldSculpting::CalculateDistance(int nPoint1, int nPoint2){	double* pdA = GetVector(nPoint1);	double* pdB = GetVector(nPoint2);	double dTmp;	double dSum = 0;	int n;	for(n = 0; n < m_nDimensions; n++)	{		dTmp = pdB[n] - pdA[n];		dSum += (dTmp * dTmp);	}	return sqrt(dSum);}double GManifoldSculpting::CalculateVectorCorrelation(int a, int vertex, int b){	double* pdA = GetVector(a);	double* pdV = GetVector(vertex);	double* pdB = GetVector(b);	double dDotProd = 0;	double dMagA = 0;	double dMagB = 0;	double dA, dB;	int n;	for(n = 0; n < m_nDimensions; n++)	{		dA = pdA[n] - pdV[n];		dB = pdB[n] - pdV[n];		dDotProd += (dA * dB);		dMagA += (dA * dA);		dMagB += (dB * dB);	}#ifdef DOT_PRODUCT_ONLY	return dDotProd;#else // DOT_PRODUCT_ONLY	if(dDotProd == 0)		return 0;	double dCorrelation = dDotProd / (sqrt(dMagA) * sqrt(dMagB));	GAssert(dCorrelation > -2 && dCorrelation < 2, "out of range");	return dCorrelation;#endif // !DOT_PRODUCT_ONLY}void GManifoldSculpting::CalculateMetadata(int nTargetDimensions){	int i, j;	{		// Make a temporary GArffRelation and GArffData to use with GNeighborFinder		GArffRelation relation;		for(i = 0; i < m_nDimensions; i++)			relation.AddAttribute(new GArffAttribute(true, 0, NULL));		GArffData data(m_nDataPoints);		for(i = 0; i < m_nDataPoints; i++)			data.AddVector(GetVector(i));		// Find the neighbors		//GManifoldNeighborFinder neighborFinder(&relation, &data, MAX(m_nNeighbors, 8), GManifoldNeighborFinder::principle_component_8);		//GYetAnotherNeighborFinder neighborFinder(m_nDimensions, &data);		GNeighborFinder neighborFinder(&relation, &data, MAX(m_nNeighbors, 8));		GTEMPBUF(int, pNeighbors, m_nNeighbors);		GTEMPBUF(double, pSquaredDistances, m_nNeighbors);		for(i = 0; i < m_nDataPoints; i++)		{			neighborFinder.FindNeighbors(pNeighbors, pSquaredDistances, m_nNeighbors, GetVector(i), i);			struct GManifoldSculptingNeighbor* pArrNeighbors = GetRecord(i);			for(j = 0; j < m_nNeighbors; j++)			{				GAssert(pNeighbors[j] >= 0, "failed to find 'k' neighbors");				pArrNeighbors[j].m_nNeighbor = pNeighbors[j];				pArrNeighbors[j].m_nNeighborsNeighbor = -1;				pArrNeighbors[j].m_dDistance = sqrt(pSquaredDistances[j]);			}		}		data.DropAllVectors();	}	// For each data point, find the most co-linear of each neighbor's neighbors	m_dAveNeighborDist = 0;	int n, nVertex, nCandidate;	struct GManifoldSculptingNeighbor* pPoint;	struct GManifoldSculptingNeighbor* pVertex;	double dCosTheta;	for(n = 0; n < m_nDataPoints; n++)	{		pPoint = GetRecord(n);		GetMetaData(pPoint)->m_nCycle = -1;		for(i = 0; i < m_nNeighbors; i++)		{			m_dAveNeighborDist += pPoint[i].m_dDistance;			nVertex = pPoint[i].m_nNeighbor;			pVertex = GetRecord(nVertex);			nCandidate = pVertex[0].m_nNeighbor;			dCosTheta = CalculateVectorCorrelation(n, nVertex, nCandidate);			pPoint[i].m_dCosTheta = dCosTheta;			pPoint[i].m_nNeighborsNeighbor = nCandidate;			for(j = 1; j < m_nNeighbors; j++)			{				nCandidate = pVertex[j].m_nNeighbor;				dCosTheta = CalculateVectorCorrelation(n, nVertex, nCandidate);				if(dCosTheta < pPoint[i].m_dCosTheta)				{					pPoint[i].m_dCosTheta = dCosTheta;					pPoint[i].m_nNeighborsNeighbor = nCandidate;				}			}#ifndef DOT_PRODUCT_ONLY			if(nTargetDimensions < 2)				pPoint[i].m_dCosTheta = GBits::GetSign(pPoint[i].m_dCosTheta);#endif // !DOT_PRODUCT_ONLY		}	}	m_dAveNeighborDist /= (m_nDataPoints * m_nNeighbors);	m_dLearningRate = m_dAveNeighborDist * 10;}double GManifoldSculpting::CalculateDataPointError(int nPoint){	double dError = 0;	double dDist;	double dTheta;	struct GManifoldSculptingNeighbor* pPoint = GetRecord(nPoint);	struct GManifoldSculptingMetaData* pDataPointData = GetMetaData(pPoint);	struct GManifoldSculptingMetaData* pNeighborData;	int n, nNeighbor;#ifdef LINEAR_WEIGHTING	double dWeight;	double dLinearSum = 0;#endif // LINEAR_WEIGHTING	for(n = 0; n < m_nNeighbors; n++)	{		nNeighbor = pPoint[n].m_nNeighbor;		dTheta = CalculateVectorCorrelation(nPoint, nNeighbor, pPoint[n].m_nNeighborsNeighbor);		dTheta -= pPoint[n].m_dCosTheta;		dTheta *= dTheta;#ifdef DOT_PRODUCT_ONLY		pNeighborData = GetMetaData(nNeighbor);		if(pNeighborData->m_nCycle != pDataPointData->m_nCycle && pNeighborData->m_bAdjustable)			dTheta /= m_nSmoothingAdvantage;		dError += dTheta;#else // DOT_PRODUCT_ONLY		//dTheta = MAX((double)0, dTheta - .01);		dDist = CalculateDistance(nPoint, nNeighbor);		dDist -= pPoint[n].m_dDistance;		dDist /= MAX(m_dAveNeighborDist, 1e-10);		dDist *= dDist;		pNeighborData = GetMetaData(nNeighbor);		if(pNeighborData->m_nCycle != pDataPointData->m_nCycle && pNeighborData->m_bAdjustable)		{			dDist /= m_nSmoothingAdvantage;			dTheta /= m_nSmoothingAdvantage;		}#	ifdef LINEAR_WEIGHTING		dWeight = 1.0 / MAX(1e-9, pPoint[n].m_dDistance);		dLinearSum += dWeight;		dError += (.5 * dDist + dTheta) * dWeight;#	else // LINEAR_WEIGHTING		dError += (2 * dDist + dTheta);#	endif // !LINEAR_WEIGHTING#endif // !DOT_PRODUCT_ONLY	}#	ifdef LINEAR_WEIGHTING	dError /= dLinearSum;#	endif // !LINEAR_WEIGHTING	return dError * dError;}int GManifoldSculpting::AdjustDataPoint(int nPoint, int nTargetDimensions, double* pError){	bool bMadeProgress = true;	double* pValues = GetVector(nPoint);	double dErrorBase = CalculateDataPointError(nPoint);	double dError = 0;	double dStepSize = m_dLearningRate * (GBits::GetRandomDouble() * .4 + .6); // We multiply the learning rate by a random value so that the points can get away from each other	int n, nSteps;	for(nSteps = 0; bMadeProgress; nSteps++)	{		bMadeProgress = false;		for(n = 0; n < nTargetDimensions; n++)		{			pValues[n] += dStepSize;			dError = CalculateDataPointError(nPoint);			if(dError >= dErrorBase)			{				pValues[n] -= (dStepSize + dStepSize);				dError = CalculateDataPointError(nPoint);			}			if(dError >= dErrorBase)				pValues[n] += dStepSize;			else			{				dErrorBase = dError;				bMadeProgress = true;			}		}	}	*pError = dError;	return nSteps - 1; // the -1 is to undo the last incrementor}void GManifoldSculpting::SquishBegin(int nTargetDimensions){	GAssert(nTargetDimensions > 0 && nTargetDimensions < m_nDimensions, "out of range");	m_nTargetDimensions = nTargetDimensions;	m_nPass = 0;	// Calculate metadata	CalculateMetadata(nTargetDimensions);	// Make the queue	m_pQ = new GIntQueue();}double GManifoldSculpting::SquishPass(int nSeedDataPoint){	int n, i, nPoint;	double* pValues;	struct GManifoldSculptingNeighbor* pPoint;	struct GManifoldSculptingMetaData* pData;	// Squish the extra dimensions	for(n = 0; n < m_nDataPoints; n++)	{		pValues = GetVector(n);		for(i = m_nTargetDimensions; i < m_nDimensions; i++)			pValues[i] *= m_dSquishingRate;	}	// Start at the seed point and correct outward in a breadth-first mannner	m_pQ->Push(nSeedDataPoint);	int nVisitedNodes = 0;	int nSteps = 0;	double dError = 0;	double dTotalError = 0;	while(m_pQ->GetSize() > 0)	{		// Check if this one has already been done		nPoint = m_pQ->Pop();		pPoint = GetRecord(nPoint);		pData = GetMetaData(pPoint);		if(pData->m_nCycle == m_nPass)			continue;		pData->m_nCycle = m_nPass;		nVisitedNodes++;		// Push all neighbors into the queue		for(n = 0; n < m_nNeighbors; n++)			m_pQ->Push(pPoint[n].m_nNeighbor);		// Adjust this data point		if(pData->m_bAdjustable)		{			nSteps += AdjustDataPoint(nPoint, m_nTargetDimensions, &dError);			dTotalError += dError;		}	}	//GAssert(nVisitedNodes * 1.25 > m_nDataPoints, "manifold appears poorly sampled");	if(nSteps < m_nDataPoints * 2)		m_dLearningRate *= .87;	else		m_dLearningRate /= .91;//	printf("[Learning Rate: %f]", m_dLearningRate);	m_nPass++;	return dTotalError;}/*static*/ GArffData* GManifoldSculpting::DoManifoldSculpting(PreProcessType ePreProcess, GArffRelation* pRelation, GArffData* pData, int nTargetDimensions, int nNeighbors, int nPreludeIterations, int nIterationsSinceBest){	// Make the squisher	int nDimensions = pRelation->GetInputCount();	int nDataPoints = pData->GetSize();	GManifoldSculpting squisher(nDataPoints, nDimensions, nNeighbors);	squisher.SetData(pRelation, pData);	// Initialize and pre-process	squisher.SquishBegin(nTargetDimensions);	if(ePreProcess == PP_NONE)	{	}	else if(ePreProcess == PP_PCA)	{		GArffData* pPreprocessedData = GPCA::DoPCA(pRelation, pData);		Holder<GArffData*> hPreprocessedData(pPreprocessedData);		squisher.SetData(pRelation, pPreprocessedData);	}	else if(ePreProcess == PP_LLE)	{		// LLE seems to require fewer neighbors than ManifoldSculpting. I'm		// not sure what the best ratio is. It should probably be another input		// parameter, but for now I'll just hack it.		int nLLENeighbors = MAX(8, nNeighbors / 2);		GArffData* pPreprocessedData = GLLE::DoLLE(pRelation, pData, nLLENeighbors);		Holder<GArffData*> hPreprocessedData(pPreprocessedData);		squisher.SetData(pRelation, pPreprocessedData);	}	else	{		GAssert(false, "Unrecognized pre-processing algorithm");	}	// Do the squishing passes	double d;	double dBestError = 0;	int n;	for(n = 0; n < nPreludeIterations; n++)		dBestError = squisher.SquishPass(rand() % nDataPoints);	for(n = 0; n < nIterationsSinceBest; n++)	{		d = squisher.SquishPass(rand() % nDataPoints);		if(d < dBestError)		{			dBestError = d;			n = 0;		}	}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?