gknn.cpp

来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 1,007 行 · 第 1/2 页

CPP
1,007
字号
	// Drop the vector	pVector = m_pData->GetVector(nIndex);	m_pData->SetPointer(nIndex, m_pData->GetVector(nOldIndex));	m_pData->DeleteCell(nOldIndex);	return pVector;}GNeighborFinderLeaf* GNeighborFinder::FindCell(double* pVector){	// Find the left node to hold the vector	GNeighborFinderNode* pNode = m_pRoot;	while(true)	{		if(pNode->IsLeaf())			return (GNeighborFinderLeaf*)pNode;		else		{			GNeighborFinderInterior* pInterior = (GNeighborFinderInterior*)pNode;			if(pVector[m_pRelation->GetInputIndex(pInterior->GetInput())] >= pInterior->GetPivot())				pNode = pInterior->GetGreater();			else				pNode = pInterior->GetLesser();		}	}}void GNeighborFinder::GetNeighborsFromCell(GNeighborFinderLeaf* pCell, double* pVector, int* pOutNeighbors, double* pOutSquaredDistances, int nNeighbors, int* pnWorstNeighbor, int nExclude){	GIntArray* pCellData = pCell->GetData();	int nCount = pCellData->GetSize();	int i, j, index;	double d;	for(i = 0; i < nCount; i++)	{		index = pCellData->GetInt(i);		if(index == nExclude)			continue;		double* pCandidate = m_pData->GetVector(index);		d = m_pRelation->ComputeInputDistanceSquared(pCandidate, pVector);		if(d < pOutSquaredDistances[*pnWorstNeighbor])		{			pOutNeighbors[*pnWorstNeighbor] = index;			pOutSquaredDistances[*pnWorstNeighbor] = d;			*pnWorstNeighbor = 0;			for(j = 1; j < nNeighbors; j++) // todo: a priority queue could make this faster when the number of neighbors is large			{				if(pOutSquaredDistances[j] > pOutSquaredDistances[*pnWorstNeighbor])					*pnWorstNeighbor = j;			}		}	}}void GNeighborFinder::FindNeighbors(int* pOutNeighbors, double* pOutSquaredDistances, int nNeighbors, double* pVector, int nExclude){	// Start with no neighbors	int i;	for(i = 0; i < nNeighbors; i++)	{		pOutNeighbors[i] = -1;		pOutSquaredDistances[i] = 1e200;	}	int nWorstNeighbor = 0;	// Start with the current cell	int nInputs = m_pRelation->GetInputCount();	GNeighborFinderLeaf* pMinCell = FindCell(pVector);	GNeighborFinderLeaf* pMaxCell = pMinCell;	GetNeighborsFromCell(pMinCell, pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude);	// Search outward until we are guaranteed to have found the best neighbors	double d;	while(true)	{		// Find the nearest moveable boundary		int index;		int nClosest = -1;		bool bGreater = false;		double dClosest = 1e200;		for(i = 0; i < nInputs; i++)		{			index = m_pRelation->GetInputIndex(i);			if(pMinCell->GetLesser(i))			{				d = pMinCell->GetMin(i) - pVector[index];				d *= d;				if(d < dClosest)				{					nClosest = i;					bGreater = false;					dClosest = d;				}			}			if(pMaxCell->GetGreater(i))			{				d = pMaxCell->GetMax(i) - pVector[index];				d *= d;				if(d < dClosest)				{					nClosest = i;					bGreater = true;					dClosest = d;				}			}		}		// See if we're guaranteed to have the best neighbors yet		if(nClosest < 0 || dClosest >= pOutSquaredDistances[nWorstNeighbor])			break;		// Advance the boundary		if(bGreater)		{			for(i = 0; i < nInputs; i++)			{				m_ppIterators[i] = pMaxCell->GetGreater(nClosest);				m_pMaxs[i] = pMinCell->GetMin(i);			}			m_pMaxs[nClosest] = m_ppIterators[0]->GetMin(nClosest);			while(true)			{				GetNeighborsFromCell(m_ppIterators[0], pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude);				for(i = 0; i < nInputs; i++)				{					if(m_ppIterators[i]->GetMin(i) > m_pMaxs[i])					{						m_ppIterators[i] = m_ppIterators[i]->GetLesser(i);						break;					}				}				if(i >= nInputs)					break;				for(i--; i >= 0; i--)					m_ppIterators[i] = m_ppIterators[i + 1];			}			pMaxCell = pMaxCell->GetGreater(nClosest);		}		else		{			for(i = 0; i < nInputs; i++)			{				m_ppIterators[i] = pMinCell->GetLesser(nClosest);				m_pMaxs[i] = pMaxCell->GetMax(i);			}			m_pMaxs[nClosest] = m_ppIterators[0]->GetMax(nClosest);			while(true)			{				GetNeighborsFromCell(m_ppIterators[0], pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude);				for(i = 0; i < nInputs; i++)				{					if(m_ppIterators[i]->GetMax(i) < m_pMaxs[i])					{						m_ppIterators[i] = m_ppIterators[i]->GetGreater(i);						break;					}				}				if(i >= nInputs)					break;				for(i--; i >= 0; i--)					m_ppIterators[i] = m_ppIterators[i + 1];			}			pMinCell = pMinCell->GetLesser(nClosest);		}	}}// -------------------------------------------------------------------------------GKNN::GKNN(GArffRelation* pRelation, int nNeighbors, bool bCopyInstances) : GSupervisedLearner(pRelation){	m_eInterpolationMethod = Linear;	m_pLearner = NULL;	m_bOwnLearner = false;	m_bCopyInstances = bCopyInstances;	m_nNeighbors = nNeighbors;	m_pInstances = NULL;	m_pScaleFactors = NULL;	m_pEvalVector = NULL;	m_pNeighborFinder = NULL;	m_pEvalNeighbors = NULL;	m_pEvalDistances = NULL;	Reset();}GKNN::~GKNN(){	delete(m_pNeighborFinder);	if(!m_bCopyInstances)		m_pInstances->DropAllVectors();	delete(m_pInstances);	delete[] m_pScaleFactors;	delete[] m_pEvalVector;	delete[] m_pEvalNeighbors;	delete[] m_pEvalDistances;}void GKNN::SetInterpolationMethod(InterpolationMethod eMethod){	GAssert(eMethod != Learner, "Call SetInterpolationLearner instead");	m_eInterpolationMethod = eMethod;}void GKNN::SetInterpolationLearner(GSupervisedLearner* pLearner, bool bOwnLearner){	if(m_bOwnLearner)		delete(m_pLearner);	GAssert(pLearner->GetRelation() == GetRelation(), "Expected the learner to be constructed with the same relation as this object");	m_pLearner = pLearner;	m_eInterpolationMethod = Learner;	m_bOwnLearner = bOwnLearner;}void GKNN::ComputeScaleFactors(GArffData* pData){	GAssert(m_pInstances->GetSize() <= 0, "You should only compute scale factors before you add any data");	if(!m_bCopyInstances)		return; // Can't scale vectors we don't own/*	int nInputs = m_pRelation->GetInputCount();	int nOutputs = m_pRelation->GetOutputCount();	int i, j, index;	GTEMPBUF(double, pOutputMeans, nOutputs);	for(j = 0; j < nOutputs; j++)		pOutputMeans[j] = pData->ComputeMean(m_pRelation->GetOutputIndex(j));	for(i = 0; i < nInputs; i++)	{		index = m_pRelation->GetInputIndex(i);		if(m_pRelation->GetAttribute(index)->IsContinuous())		{			double dInputMean = pData->ComputeMean(m_pRelation->GetInputIndex(i));			double d = 0;			for(j = 0; j < nOutputs; j++)				d = MAX(d, pData->ComputeCovariance(m_pRelation->GetInputIndex(i), dInputMean, m_pRelation->GetOutputIndex(j), pOutputMeans[j]));			m_pScaleFactors[i] = d;		}		else			m_pScaleFactors[i] = 1;	}*/}void GKNN::AddVector(double* pVector){	double* pVec;	if(m_bCopyInstances)	{		// Make a copy of the vector		int nAttributes = m_pRelation->GetAttributeCount();		pVec = new double[nAttributes];		memcpy(pVec, pVector, sizeof(double) * nAttributes);		// Scale the copy		int i;		int nInputs = m_pRelation->GetInputCount();		for(i = 0; i < nInputs; i++)			pVec[m_pRelation->GetInputIndex(i)] *= m_pScaleFactors[i];	}	else		pVec = pVector;	// Add it to the known instances	if(m_pNeighborFinder)		m_pNeighborFinder->AddVector(pVec); // GNeighborFinder has a pointer to m_pInstances, and it will add pVec to m_pInstances when we make this call	else		m_pInstances->AddPointer(pVec);}void GKNN::AddVectorAndDeleteNeighborIfClose(double* pVector, double dCloseDistance){	double* pVec;	if(m_bCopyInstances)	{		// Make a copy of the vector		int nAttributes = m_pRelation->GetAttributeCount();		pVec = new double[nAttributes];		memcpy(pVec, pVector, sizeof(double) * nAttributes);		// Scale the copy		int i;		int nInputs = m_pRelation->GetInputCount();		for(i = 0; i < nInputs; i++)			pVec[m_pRelation->GetInputIndex(i)] *= m_pScaleFactors[i];	}	else		pVec = pVector;	// Delete the closest neighbor if it's closer than the specified threshold	if(!m_pNeighborFinder)		m_pNeighborFinder = new GNeighborFinder(m_pRelation, m_pInstances, m_pRelation->GetInputCount());	m_pNeighborFinder->FindNeighbors(m_pEvalNeighbors, m_pEvalDistances, 1, pVec, -1);	if(m_pEvalDistances[0] < dCloseDistance * dCloseDistance)	{		double* pClosest = m_pNeighborFinder->DropVector(m_pEvalNeighbors[0]);		if(m_bCopyInstances)			delete[] pClosest;	}	// Add the vector	m_pNeighborFinder->AddVector(pVec);}void GKNN::Train(GArffData* pData){	ComputeScaleFactors(pData);	int nVectors = pData->GetSize();	double* pVector;	int n;	for(n = 0; n < nVectors; n++)	{		pVector = pData->GetVector(n);		AddVector(pVector);	}}void GKNN::FindNeighbors(double* pVector){	int nInputs = m_pRelation->GetInputCount();	if(!m_pNeighborFinder)		m_pNeighborFinder = new GNeighborFinder(m_pRelation, m_pInstances, nInputs);	if(m_bCopyInstances)	{		memcpy(m_pEvalVector, pVector, sizeof(double) * m_pRelation->GetAttributeCount());		int i;		for(i = 0; i < nInputs; i++)			m_pEvalVector[m_pRelation->GetInputIndex(i)] *= m_pScaleFactors[i];		m_pNeighborFinder->FindNeighbors(m_pEvalNeighbors, m_pEvalDistances, m_nNeighbors, m_pEvalVector, -1);	}	else		m_pNeighborFinder->FindNeighbors(m_pEvalNeighbors, m_pEvalDistances, m_nNeighbors, pVector, -1);}void GKNN::InterpolateMean(double* pVector){	int nOutputs = m_pRelation->GetOutputCount();	GArffAttribute* pAttr;	int i, j, k, index;	double* pVectorNeighbor;	for(i = 0; i < nOutputs; i++)	{		index = m_pRelation->GetOutputIndex(i);		pAttr = m_pRelation->GetAttribute(index);		if(pAttr->IsContinuous())		{			double dSum = 0;			for(j = 0; j < m_nNeighbors; j++)			{				k = m_pEvalNeighbors[j];				if(k >= 0)				{					pVectorNeighbor = m_pInstances->GetVector(k);					dSum += pVectorNeighbor[index];				}			}			pVector[index] = dSum / m_nNeighbors;		}		else		{			int nValueCount = pAttr->GetValueCount();			int* pCounts = new int[nValueCount];			Holder<int*> hCounts(pCounts);			memset(pCounts, '\0', sizeof(int) * nValueCount);			for(j = 0; j < m_nNeighbors; j++)			{				k = m_pEvalNeighbors[j];				if(k >= 0)				{					pVectorNeighbor = m_pInstances->GetVector(k);					pCounts[(int)pVectorNeighbor[index]]++;				}			}			k = 0;			for(j = 1; j < nValueCount; j++)			{				if(pCounts[j] > pCounts[k])					k = j;			}			pVector[index] = (double)k;		}	}}void GKNN::InterpolateLinear(double* pVector){	int nOutputs = m_pRelation->GetOutputCount();	GArffAttribute* pAttr;	int i, j, k, index;	double d;	double* pVectorNeighbor;	for(i = 0; i < nOutputs; i++)	{		index = m_pRelation->GetOutputIndex(i);		pAttr = m_pRelation->GetAttribute(index);		if(pAttr->IsContinuous())		{			double dSum = 0;			double dTot = 0;			for(j = 0; j < m_nNeighbors; j++)			{				k = m_pEvalNeighbors[j];				if(k >= 0)				{					pVectorNeighbor = m_pInstances->GetVector(k);					d = m_pEvalDistances[j];					d = 1.0 / MAX(d, 1e-9); // to be truly "linear", we should use sqrt(d) instead of d, but this is faster to compute					dSum += d * pVectorNeighbor[index];					dTot += d;				}			}			if(dTot > 0)				pVector[index] = dSum / dTot;			else				pVector[index] = dSum;		}		else		{			int nValueCount = pAttr->GetValueCount();			double* pScores = new double[nValueCount];			ArrayHolder<double*> hScores(pScores);			for(j = 0; j < nValueCount; j++)				pScores[j] = 0;			for(j = 0; j < m_nNeighbors; j++)			{				k = m_pEvalNeighbors[j];				if(k >= 0)				{					pVectorNeighbor = m_pInstances->GetVector(k);					d = m_pEvalDistances[j];					d = 1 / (d + .00001);					pScores[(int)pVectorNeighbor[index]] += d;				}			}			pVector[index] = (double)GVector::FindMax(pScores, nValueCount);		}	}}void GKNN::InterpolateLearner(double* pVector){	GAssert(m_pLearner, "no learner is set");	m_pLearner->Reset();	int i, nNeighbor;	GArffData data(m_nNeighbors);	for(i = 0; i < m_nNeighbors; i++)	{		nNeighbor = m_pEvalNeighbors[i];		if(nNeighbor >= 0)			data.AddVector(m_pInstances->GetVector(nNeighbor));	}	m_pLearner->Train(&data);	data.DropAllVectors();	m_pLearner->Eval(pVector);}// virtualvoid GKNN::Eval(double* pVector){	FindNeighbors(pVector);	switch(m_eInterpolationMethod)	{		case Linear:			InterpolateLinear(pVector);			break;		case Mean:			InterpolateMean(pVector);			break;		case Learner:			InterpolateLearner(pVector);			break;		default:			GAssert(false, "unexpected enumeration");			break;	}}// virtualvoid GKNN::Reset(){	if(m_pInstances)	{		if(!m_bCopyInstances)			m_pInstances->DropAllVectors();		delete(m_pInstances);	}	m_pInstances = new GArffData(1024);	delete[] m_pScaleFactors;	delete[] m_pEvalVector;	int nInputs = m_pRelation->GetInputCount();	m_pScaleFactors = new double[nInputs];	m_pEvalVector = new double[m_pRelation->GetAttributeCount()];	int n;	for(n = 0; n < nInputs; n++)		m_pScaleFactors[n] = 1;	delete(m_pNeighborFinder);	m_pNeighborFinder = NULL;	delete[] m_pEvalNeighbors;	delete[] m_pEvalDistances;	m_pEvalNeighbors = new int[m_nNeighbors];	m_pEvalDistances = new double[m_nNeighbors];}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?