📄 gknn.cpp
字号:
d = rel.ComputeInputDistanceSquared(pVector, data.GetVector(pNeighbors[j])); if(d > dWorstDistance) { dWorstDistance = d; nWorstNeighbor = j; } } for(j = 0; j < nPoints; j++) { pVector2 = data.GetVector(j); d = rel.ComputeInputDistanceSquared(pVector, pVector2); if(d == 0) { } else if(d < dWorstDistance) { for(k = 0; k < nNeighbors; k++) { if(k > 0) { // This isn't a complete check, but it should be good enough if(pNeighbors[k] == pNeighbors[k - 1]) throw "same neighbor twice"; if(pNeighbors[k] == pNeighbors[0]) throw "same neighbor twice"; } if(pNeighbors[k] == i) throw "failed to exclude itself"; if(data.GetVector(pNeighbors[k]) == pVector2) break; } if(k >= nNeighbors) throw "missed a neighbor"; } } }}#endif // !NO_TEST_CODEGNeighborFinderLeaf* GNeighborFinder::FindCell(double* pVector){ // Find the left node to hold the vector GNeighborFinderNode* pNode = m_pRoot; while(true) { if(pNode->IsLeaf()) return (GNeighborFinderLeaf*)pNode; else { GNeighborFinderInterior* pInterior = (GNeighborFinderInterior*)pNode; if(pVector[m_pRelation->GetInputIndex(pInterior->GetInput())] >= pInterior->GetPivot()) pNode = pInterior->GetGreater(); else pNode = pInterior->GetLesser(); } }}void GNeighborFinder::GetNeighborsFromCell(GNeighborFinderLeaf* pCell, double* pVector, int* pOutNeighbors, double* pOutSquaredDistances, int nNeighbors, int* pnWorstNeighbor, int nExclude){ GIntArray* pCellData = pCell->GetData(); int nCount = pCellData->GetSize(); int i, j, index; double d; for(i = 0; i < nCount; i++) { index = pCellData->GetInt(i); if(index == nExclude) continue; double* pCandidate = m_pData->GetVector(index); d = m_pRelation->ComputeInputDistanceSquared(pCandidate, pVector); if(d < pOutSquaredDistances[*pnWorstNeighbor]) { pOutNeighbors[*pnWorstNeighbor] = index; pOutSquaredDistances[*pnWorstNeighbor] = d; *pnWorstNeighbor = 0; for(j = 1; j < nNeighbors; j++) // todo: a priority queue could make this faster when the number of neighbors is large { if(pOutSquaredDistances[j] > pOutSquaredDistances[*pnWorstNeighbor]) *pnWorstNeighbor = j; } } }}void GNeighborFinder::FindNeighbors(int* pOutNeighbors, double* pOutSquaredDistances, int nNeighbors, double* pVector, int nExclude){ // Start with no neighbors int i; for(i = 0; i < nNeighbors; i++) { pOutNeighbors[i] = -1; pOutSquaredDistances[i] = 1e200; } int nWorstNeighbor = 0; // Start with the current cell int nInputs = m_pRelation->GetInputCount(); GNeighborFinderLeaf* pMinCell = FindCell(pVector); GNeighborFinderLeaf* pMaxCell = pMinCell; GetNeighborsFromCell(pMinCell, pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude); // Search outward until we are guaranteed to have found the best neighbors double d; while(true) { // Find the nearest moveable boundary int index; int nClosest = -1; bool bGreater = false; double dClosest = 1e200; for(i = 0; i < nInputs; i++) { index = m_pRelation->GetInputIndex(i); if(pMinCell->GetLesser(i)) { d = pMinCell->GetMin(i) - pVector[index]; d *= d; if(d < dClosest) { nClosest = i; bGreater = false; dClosest = d; } } if(pMaxCell->GetGreater(i)) { d = pMaxCell->GetMax(i) - pVector[index]; d *= d; if(d < dClosest) { nClosest = i; bGreater = true; dClosest = d; } } } // See if we're guaranteed to have the best neighbors yet if(nClosest < 0 || dClosest >= pOutSquaredDistances[nWorstNeighbor]) break; // Advance the boundary if(bGreater) { for(i = 0; i < nInputs; i++) { m_ppIterators[i] = pMaxCell->GetGreater(nClosest); m_pMaxs[i] = pMinCell->GetMin(i); } m_pMaxs[nClosest] = m_ppIterators[0]->GetMin(nClosest); while(true) { GetNeighborsFromCell(m_ppIterators[0], pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude); for(i = 0; i < nInputs; i++) { if(m_ppIterators[i]->GetMin(i) > m_pMaxs[i]) { m_ppIterators[i] = m_ppIterators[i]->GetLesser(i); break; } } if(i >= nInputs) break; for(i--; i >= 0; i--) m_ppIterators[i] = m_ppIterators[i + 1]; } pMaxCell = pMaxCell->GetGreater(nClosest); } else { for(i = 0; i < nInputs; i++) { m_ppIterators[i] = pMinCell->GetLesser(nClosest); m_pMaxs[i] = pMaxCell->GetMax(i); } m_pMaxs[nClosest] = m_ppIterators[0]->GetMax(nClosest); while(true) { GetNeighborsFromCell(m_ppIterators[0], pVector, pOutNeighbors, pOutSquaredDistances, nNeighbors, &nWorstNeighbor, nExclude); for(i = 0; i < nInputs; i++) { if(m_ppIterators[i]->GetMax(i) < m_pMaxs[i]) { m_ppIterators[i] = m_ppIterators[i]->GetGreater(i); break; } } if(i >= nInputs) break; for(i--; i >= 0; i--) m_ppIterators[i] = m_ppIterators[i + 1]; } pMinCell = pMinCell->GetLesser(nClosest); } }}// -------------------------------------------------------------------------------GKNN::GKNN(GArffRelation* pRelation, int nNeighbors) : GSupervisedLearner(pRelation){ m_nNeighbors = nNeighbors; m_pRows = new GArffData(1024); int nInputs = m_pRelation->GetInputCount(); m_pScaleFactors = new double[nInputs]; m_pEvalVector = new double[m_pRelation->GetAttributeCount()]; int n; for(n = 0; n < nInputs; n++) m_pScaleFactors[n] = 1; m_pNeighborFinder = NULL; m_pEvalNeighbors = new int[nNeighbors]; m_pEvalDistances = new double[nNeighbors];}GKNN::~GKNN(){ delete(m_pNeighborFinder); delete(m_pRows); delete[] m_pScaleFactors; delete[] m_pEvalVector; delete[] m_pEvalNeighbors; delete[] m_pEvalDistances;}void GKNN::ComputeScaleFactors(GArffData* pData){ GAssert(m_pRows->GetSize() <= 0, "You should compute scale factors before you add any data"); int nInputs = m_pRelation->GetInputCount(); double dMean, dVariance; int i, index; for(i = 0; i < nInputs; i++) { index = m_pRelation->GetInputIndex(i); if(m_pRelation->GetAttribute(index)->IsContinuous()) { dMean = pData->ComputeMean(index); dVariance = pData->ComputeVariance(dMean, index); m_pScaleFactors[i] = sqrt(dVariance); } else m_pScaleFactors[i] = 1; }}void GKNN::AddVector(double* pRow){ delete(m_pNeighborFinder); m_pNeighborFinder = NULL; int nAttributes = m_pRelation->GetAttributeCount(); int nInputs = m_pRelation->GetInputCount(); double* pData = new double[nAttributes]; m_pRows->AddPointer(pData); memcpy(pData, pRow, sizeof(double) * nAttributes); int i; for(i = 0; i < nInputs; i++) pData[m_pRelation->GetInputIndex(i)] *= m_pScaleFactors[i];}void GKNN::Train(GArffData* pData){ ComputeScaleFactors(pData); int nRows = pData->GetSize(); double* pRow; int n; for(n = 0; n < nRows; n++) { pRow = pData->GetVector(n); AddVector(pRow); }}void GKNN::FindNeighbors(double* pRow){ int nInputs = m_pRelation->GetInputCount(); if(!m_pNeighborFinder) m_pNeighborFinder = new GNeighborFinder(m_pRelation, m_pRows, nInputs); memcpy(m_pEvalVector, pRow, sizeof(double) * m_pRelation->GetAttributeCount()); int i; for(i = 0; i < nInputs; i++) m_pEvalVector[m_pRelation->GetInputIndex(i)] *= m_pScaleFactors[i]; m_pNeighborFinder->FindNeighbors(m_pEvalNeighbors, m_pEvalDistances, m_nNeighbors, m_pEvalVector, -1);}void GKNN::EvalEqualWeight(double* pRow){ FindNeighbors(pRow); int nOutputs = m_pRelation->GetOutputCount(); GArffAttribute* pAttr; int i, j, index; double* pRowNeighbor; for(i = 0; i < nOutputs; i++) { index = m_pRelation->GetOutputIndex(i); pAttr = m_pRelation->GetAttribute(index); if(pAttr->IsContinuous()) { double dSum = 0; for(j = 0; j < m_nNeighbors; j++) { pRowNeighbor = (double*)m_pRows->GetPointer(m_pEvalNeighbors[j]); dSum += pRowNeighbor[index]; } pRow[index] = dSum / m_nNeighbors; } else { GVector v(pAttr->GetValueCount()); for(j = 0; j < m_nNeighbors; j++) { pRowNeighbor = (double*)m_pRows->GetPointer(m_pEvalNeighbors[j]); v.Add((int)pRowNeighbor[index], 1); } pRow[index] = (double)v.GetIndexOfBiggestValue(); } }}void GKNN::EvalLinearWeight(double* pRow){ FindNeighbors(pRow); int nOutputs = m_pRelation->GetOutputCount(); GArffAttribute* pAttr; int i, j, index; double d; double* pRowNeighbor; for(i = 0; i < nOutputs; i++) { index = m_pRelation->GetOutputIndex(i); pAttr = m_pRelation->GetAttribute(index); if(pAttr->IsContinuous()) { double dSum = 0; double dTot = 0; for(j = 0; j < m_nNeighbors; j++) { pRowNeighbor = (double*)m_pRows->GetPointer(m_pEvalNeighbors[j]); d = m_pRelation->ComputeInputDistanceSquared(pRow, pRowNeighbor); d = 1 / (d + .0001); dSum += d * pRowNeighbor[index]; dTot += d; } pRow[index] = dSum / dTot; } else { GVector v(pAttr->GetValueCount()); for(j = 0; j < m_nNeighbors; j++) { pRowNeighbor = (double*)m_pRows->GetPointer(m_pEvalNeighbors[j]); d = m_pRelation->ComputeInputDistanceSquared(pRow, pRowNeighbor); d = 1 / (d + .001); v.Add((int)pRowNeighbor[index], d); } pRow[index] = (double)v.GetIndexOfBiggestValue(); } }}double* GKNN::DropRow(int nRow){ int nCount = m_pRows->GetSize(); double* pRow = (double*)m_pRows->GetPointer(nRow); m_pRows->SetPointer(nRow, m_pRows->GetPointer(nCount - 1)); m_pRows->DeleteCell(nCount - 1); return pRow;}int GKNN::FindLeastHelpfulRow(GArffData* pTestSet){ double dBestError = 1e100; int nBestIndex = 0; int n; double d; for(n = m_pRows->GetSize() - 1; n >= 0; n--) { double* pTemp = DropRow(n); d = MeasureMeanSquaredError(pTestSet); m_pRows->AddPointer(pTemp); if(d < dBestError) { dBestError = d; nBestIndex = n; } } return nBestIndex;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -