garff.cpp
来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C++ 代码 · 共 1,909 行 · 第 1/3 页
CPP
1,909 行
dVal = pRow[nIndex]; for(n = 1; n < nRowCount; n++) { pRow = GetVector(n); if(pRow[nIndex] != dVal) return false; } } else { for(n = 0; n < nRowCount; n++) { pRow = GetVector(n); nVal = (int)pRow[nIndex]; if(nVal >= 0) { n++; break; } } for( ; n < nRowCount; n++) { pRow = GetVector(n); nTmp = (int)pRow[nIndex]; if(nTmp != nVal && nTmp >= 0) return false; } } } return true;}void GArffData::RandomlyReplaceMissingData(GArffRelation* pRelation){ int n, i, j; int nRowCount = GetSize(); int nAttrCount = pRelation->GetAttributeCount(); int nMaxValues = 0; int nValues; int nVal; int nSum; int nRand; int* pCounts = NULL; double* pRow; GArffAttribute* pAttr; for(i = 0; i < nAttrCount; i++) { // Make a buffer to hold the counts pAttr = pRelation->GetAttribute(i); if(pAttr->IsContinuous()) continue; nValues = pAttr->GetValueCount(); if(nValues > nMaxValues) { delete(pCounts); nMaxValues = pAttr->GetValueCount() + 3; pCounts = new int[nMaxValues]; } // Count the number of each value memset(pCounts, '\0', sizeof(int) * nValues); for(n = 0; n < nRowCount; n++) { nVal = (int)GetVector(n)[i]; if(nVal >= 0) { GAssert(nVal < nValues, "out of range"); pCounts[nVal]++; } else { GAssert(nVal == -1, "out of range"); } } // Sum the value counts nSum = 0; for(n = 0; n < nValues; n++) nSum += pCounts[n]; // Replace the missing values for(n = 0; n < nRowCount; n++) { pRow = GetVector(n); nVal = (int)pRow[i]; if(nVal < 0) { nRand = (int)(GBits::GetRandomUint() % nSum); for(j = 0; ; j++) { GAssert(j < nValues, "internal inconsistency"); nRand -= pCounts[j]; if(nRand < 0) { pRow[i] = (double)j; break; } } } } }}void GArffData::ReplaceMissingAttributeWithMostCommonValue(GArffRelation* pRelation, int nAttribute){ GArffAttribute* pAttr = pRelation->GetAttribute(nAttribute); if(pAttr->IsContinuous()) return; // missing values are currently only supported for discreet values int nValues = pAttr->GetValueCount(); GTEMPBUF(int, pCounts, nValues); memset(pCounts, '\0', sizeof(int) * nValues); double* 
pRow; int nRowCount = GetSize(); int n, nVal; for(n = 0; n < nRowCount; n++) { pRow = GetVector(n); nVal = (int)pRow[nAttribute]; if(nVal < 0) continue; GAssert(nVal < nValues, "out of range"); pCounts[nVal]++; } int nBest = 0; for(n = 1; n < nValues; n++) { if(pCounts[n] > pCounts[nBest]) nBest = n; } for(n = 0; n < nRowCount; n++) { pRow = GetVector(n); nVal = (int)pRow[nAttribute]; if(nVal < 0) { pRow[nAttribute] = (double)nBest; } }}void GArffData::Print(int nAttributes){ int nRows = GetSize(); double* pRow; int n, i; for(n = 0; n < nRows; n++) { pRow = GetVector(n); printf("%f", pRow[0]); for(i = 1; i < nAttributes; i++) printf("\t%f", pRow[i]); printf("\n"); }}int ComputeMinimumVariancePivotComparer(void* pThis, void* pA, void* pB){ int nAttr = *(int*)pThis; double* pdA = (double*)pA; double* pdB = (double*)pB; if(pdA[nAttr] >= pdB[nAttr]) return 1; else return -1;}double GArffData::ComputeMinimumVariancePivot(int nAttr){ int nRows = GetSize(); GPointerArray arr(nRows); int n; for(n = 0; n < nRows; n++) arr.AddPointer(GetVector(n)); arr.Sort(ComputeMinimumVariancePivotComparer, &nAttr); double dBestPivotScore = 1e100; double dBestPivot = 0; double dPivot, d; double* pRow1; double* pRow2; double dMean1, dMean2, dVar1, dVar2; int nCount1, nCount2, i; for(n = nRows - 2; n >= 0; n--) { // Try a pivot pRow1 = (double*)arr.GetPointer(n); pRow2 = (double*)arr.GetPointer(n + 1); dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2; // Compute the mean of each half dMean1 = 0; dMean2 = 0; nCount1 = 0; nCount2 = 0; for(i = 0; i < nRows; i++) { pRow1 = GetVector(i); if(pRow1[nAttr] < dPivot) { nCount1++; dMean1 += pRow1[nAttr]; } else { nCount2++; dMean2 += pRow1[nAttr]; } } dMean1 /= nCount1; dMean2 /= nCount2; // Compute the variance of each half dVar1 = 0; dVar2 = 0; for(i = 0; i < nRows; i++) { pRow1 = GetVector(i); if(pRow1[nAttr] < dPivot) { d = pRow1[nAttr] - dMean1; dVar1 += (d * d); } else { d = pRow2[nAttr] - dMean2; dVar2 += (d * d); } } dVar1 /= nCount1; dVar2 /= 
nCount2; d = dVar1 + dVar2; // See if we've got a new best score if(d < dBestPivotScore) { dBestPivotScore = d; dBestPivot = dPivot; } } return dBestPivot;}bool GArffData::PickPivotToReduceInfo(double* pOutPivot, double* pOutputInfo, GArffRelation* pRelation, int nAttr){ int nRows = GetSize(); int n; double dBestPivotScore = 1e100; double dPivot, d; double* pRow1; double* pRow2; bool bGotOne = false; for(n = 0; n < 10; n++) { // Try a pivot pRow1 = GetVector(rand() % nRows); pRow2 = GetVector(rand() % nRows); dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2; // Split at the pivot and measure the sum info GArffData* pData2 = SplitByPivot(nAttr, dPivot); if(GetSize() > 0 && pData2->GetSize() > 0) { d = (pRelation->MeasureTotalOutputInfo(this) * GetSize() + pRelation->MeasureTotalOutputInfo(pData2) * pData2->GetSize()) / (double)(GetSize() + pData2->GetSize()); // See if we've got a new best score if(d < dBestPivotScore) { dBestPivotScore = d; *pOutPivot = dPivot; bGotOne = true; } } Merge(pData2); delete(pData2); } *pOutputInfo = dBestPivotScore; return bGotOne;}void GArffData::ComputePrincipleComponent(int nDims, double* pOutVector, int nIterations, bool bExtract){ // Initialize the out-vector to a random direction and compute the mean int i, j, n; double* pMean = new double[2 * nDims]; Holder<double*> hMean(pMean); for(j = 0; j < nDims; j++) { pOutVector[j] = GBits::GetRandomDouble(); pMean[j] = ComputeMean(j); } GVector::Normalize(pOutVector, nDims); // Translate the data such that the mean is at the origin double* pVector; int nCount = GetSize(); for(n = 0; n < nCount; n++) { pVector = GetVector(n); for(j = 0; j < nDims; j++) pVector[j] -= pMean[j]; } // Iterate double* pAccumulator = pMean + nDims; double d; for(i = 0; i < nIterations; i++) { for(j = 0; j < nDims; j++) pAccumulator[j] = 0; for(n = 0; n < nCount; n++) { pVector = GetVector(n); d = GVector::ComputeDotProduct(pVector, pOutVector, nDims); for(j = 0; j < nDims; j++) pAccumulator[j] += pVector[j] * d; } 
GVector::Normalize(pAccumulator, nDims); //if(GVector::ComputeSquaredDistance(pAccumulator, pOutVector, nDims) < 1e-18) // break; memcpy(pOutVector, pAccumulator, sizeof(double) * nDims); } // Normalize GVector::Normalize(pOutVector, nDims); // Optionally remove the component if(bExtract) { for(i = 0; i < nCount; i++) { pVector = GetVector(i); d = GVector::ComputeDotProduct(pVector, pOutVector, nDims); for(j = 0; j < nDims; j++) pVector[j] -= d * pOutVector[j]; } } // Restore the data to its original position for(n = 0; n < nCount; n++) { pVector = GetVector(n); for(j = 0; j < nDims; j++) pVector[j] += pMean[j]; }}double GArffData::ComputeCovariance(int nAttr1, double dMean1, int nAttr2, double dMean2){ int nRowCount = GetSize(); double* pVector; double dSum = 0; int i; for(i = 0; i < nRowCount; i++) { pVector = GetVector(i); dSum += ((pVector[nAttr1] - dMean1) * (pVector[nAttr2] - dMean2)); } return dSum / (nRowCount - 1); // todo: why do we subtract one here? Is that ALWAYS the right thing to do?}void GArffData::ComputeCovarianceMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation){ // Resize the matrix int nInputs = pRelation->GetInputCount(); pOutMatrix->Resize(nInputs, nInputs); // Compute the deviations Holder<double*> hMeans(new double[nInputs]); double* pMeans = hMeans.Get(); int nRowCount = GetSize(); double* pRow; int n, i, nIndex; for(i = 0; i < nInputs; i++) { nIndex = pRelation->GetInputIndex(i); // Compute the mean double dSum = 0; for(n = 0; n < nRowCount; n++) { pRow = GetVector(n); dSum += pRow[nIndex]; } pMeans[i] = dSum / nRowCount; } // Compute the covariances for half the matrix for(i = 0; i < nInputs; i++) { for(n = i; n < nInputs; n++) pOutMatrix->Set(i, n, ComputeCovariance(pRelation->GetInputIndex(i), pMeans[i], pRelation->GetInputIndex(n), pMeans[n])); } // Fill out the other half of the matrix for(i = 1; i < nInputs; i++) { for(n = 0; n < i; n++) pOutMatrix->Set(i, n, pOutMatrix->Get(n, i)); }}void 
// Fills pOutMatrix with conditional co-occurrence ratios: cell (r, c) holds
// the fraction of rows whose column-attribute has value c that also have
// value r for attribute nAttr. The columns enumerate every value of every
// attribute, in order. Cells with no supporting rows get noDataValue.
GArffData::ComputeCoprobabilityMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation, int nAttr, double noDataValue)
{
	// Resize the matrix: one row per value of nAttr, one column per value of
	// every attribute
	GArffAttribute* pAttr = pRelation->GetAttribute(nAttr);
	int nRows = pAttr->GetValueCount();
	int nAttributes = pRelation->GetAttributeCount();
	int nCols = 0;
	int i;
	for(i = 0; i < nAttributes; i++)
	{
		GArffAttribute* pAttrCol = pRelation->GetAttribute(i);
		nCols += pAttrCol->GetValueCount();
	}
	pOutMatrix->Resize(nRows, nCols);

	// Compute the coprobabilities
	int nRowCount = GetSize();
	int row, col, nMatch, nTotal, nAttrCol, nVal;
	double* pRow;
	for(row = 0; row < nRows; row++)
	{
		col = 0;
		for(nAttrCol = 0; nAttrCol < nAttributes; nAttrCol++)
		{
			GArffAttribute* pAttrCol = pRelation->GetAttribute(nAttrCol);
			for(nVal = 0; nVal < pAttrCol->GetValueCount(); nVal++)
			{
				// Count the rows where this column-attribute equals nVal, and
				// how many of those also have value 'row' for nAttr
				nMatch = 0;
				nTotal = 0;
				for(i = 0; i < nRowCount; i++)
				{
					pRow = GetVector(i);
					if((int)pRow[nAttrCol] == nVal)
					{
						nTotal++;
						if((int)pRow[nAttr] == row)
							nMatch++;
					}
				}
				if(nTotal == 0)
					pOutMatrix->Set(row, col, noDataValue);
				else
					pOutMatrix->Set(row, col, (double)nMatch / nTotal);
				col++;
			}
		}
		GAssert(col == nCols, "problem with columns");
	}
}

// Comparer for GArffData::Sort: orders rows ascending by the dimension
// (attribute index) pointed to by pThis.
int DimensionComparer(void* pThis, void* pA, void* pB)
{
	int nDim = *(int*)pThis;
	if(((double*)pA)[nDim] < ((double*)pB)[nDim])
		return -1;
	else if(((double*)pA)[nDim] > ((double*)pB)[nDim])
		return 1;
	else
		return 0;
}

// Sorts the rows in ascending order of the specified attribute
void GArffData::Sort(int nDimension)
{
	GPointerArray::Sort(DimensionComparer, &nDimension);
}

/*GArffData* GArffData::SlowFourierTransform(GArffRelation* pRel, bool bForward){ int nCount = GetSize(); GAssert(nCount > 1, "Must have at least two points"); GArffData* pTransformed = new GArffData(nCount); double dSumReal, dSumImag, dTwidReal, dTwidImag, dR, dI, dTmp; double* pVec1; double* pVec2; double* pOutputVector; int nInputCount = pRel->GetInputCount(); int nOutputCount = pRel->GetOutputCount(); GAssert((nOutputCount & 1) == 0, "Expected an even number of outputs. 
Even=Real, Odd=Imag"); int i, j, nInput, nOutput, indexReal, indexImag, inputIndex; double dCircum = bForward ? -2.0 * PI : 2.0 * PI; // Compute the mins and ranges double* pMinsAndRanges = new double[nInputCount * 2]; ArrayHolder<double*> hMinsAndRanges(pMinsAndRanges); for(nInput = 0; nInput < nInputCount; nInput++) { GetMinAndRange(pRel->GetInputIndex(nInput), &pMinsAndRanges[2 * nInput], &pMinsAndRanges[2 * nInput + 1]); pMinsAndRanges[2 * nInput + 1] += (pMinsAndRanges[2 * nInput + 1] / (nCount - 1)); } // Perform the transform for(i = 0; i < nCount; i++) { pOutputVector = new double[nInputCount + nOutputCount]; pVec1 = GetVector(i); for(nInput = 0; nInput < nInputCount; nInput++) { j = pRel->GetInputIndex(nInput); pOutputVector[j] = pVec1[j]; } for(nOutput = 0; nOutput < nOutputCount; nOutput += 2) { indexReal = pRel->GetOutputIndex(nOutput); indexImag = pRel->GetOutputIndex(nOutput + 1); dSumReal = 0; dSumImag = 0; for(j = 0; j < nCount; j++) { pVec2 = GetVector(j); dTwidReal = 1; dTwidImag = 0; for(nInput = 0; nInput < nInputCount; nInput++) { inputIndex = pRel->GetInputIndex(nInput); dTmp = dCircum * (pVec1[inputIndex] - pMinsAndRanges[2 * nInput]) * (pVec2[inputIndex] - pMinsAndRanges[2 * nInput]) / pMinsAndRanges[2 * nInput + 1]; dR = cos(dTmp); dI = sin(dTmp); dTmp = dTwidReal; dTwidReal = dTwidReal * dR - dTwidImag * dI; dTwidImag = dTmp * dI + dR * dTwidImag; } dSumReal += dTwidReal * pVec2[indexReal] - dTwidImag * pVec2[indexImag]; dSumImag += dTwidReal * pVec2[indexImag] + dTwidImag * pVec2[indexReal]; } pOutputVector[indexReal] = dSumReal; pOutputVector[indexImag] = dSumImag; } pTransformed->AddVector(pOutputVector); } // Scale the reverse transform if(!bForward) { double dFactor = 1.0; for(nInput = 0; nInput < nInputCount; nInput++) dFactor /= pMinsAndRanges[2 * nInput + 1]; for(i = 0; i < nCount; i++) { pOutputVector = pTransformed->GetVector(i); for(nOutput = 0; nOutput < nOutputCount; nOutput++) pOutputVector[pRel->GetOutputIndex(nOutput)] *= 
dFactor; } } return pTransformed;}*/

// Appends nNoiseDims new continuous attributes to the relation and fills the
// corresponding new columns of every row with Gaussian random noise.
void GArffData::AddGaussianNoiseDimensions(GArffRelation* pRelation, int nNoiseDims)
{
	int nOldAttributes = pRelation->GetAttributeCount();
	int i, j;
	for(i = 0; i < nNoiseDims; i++)
		pRelation->AddAttribute(new GArffAttribute(true, 0, NULL));
	int nNewAttributes = pRelation->GetAttributeCount();
	for(i = 0; i < GetSize(); i++)
	{
		// Reallocate each row with room for the new columns, copy the old
		// values over, and fill the new ones with noise
		double* pOldVector = GetVector(i);
		double* pNewVector = new double[nNewAttributes];
		memcpy(pNewVector, pOldVector, sizeof(double) * nOldAttributes);
		for(j = nOldAttributes; j < nNewAttributes; j++)
			pNewVector[j] = GBits::GetRandomGaussian();
		SwapVector(i, pNewVector);
		delete[] pOldVector;
	}
}

#ifndef NO_TEST_CODE
// static
void GArffData::Test()
{
	// Make some data that lies exactly on the line y = 2x
	GArffData data(100);
	int i;
	for(i = 0; i < 100; i++)
	{
		double* pNewVector = new double[2];
		pNewVector[0] = GBits::GetRandomDouble();
		pNewVector[1] = 2 * pNewVector[0];
		data.AddVector(pNewVector);
	}

	// Find principle components; the direction should be proportional to (1, 2)
	double eig[2];
	data.ComputePrincipleComponent(2, eig, 10, false);
	if(ABS(eig[0] * 2 - eig[1]) > .0001)
		throw "incorrect value";

	// Compute principle components via eigenvectors of covariance matrix, and
	// make sure they're the same
	GArffRelation rel;
	rel.AddAttribute(new GArffAttribute(true, 0, NULL));
	rel.AddAttribute(new GArffAttribute(true, 0, NULL));
	GMatrix m;
	data.ComputeCovarianceMatrix(&m, &rel);
	GMatrix eigenVectors;
	eigenVectors.ComputeEigenVectors(1, &m);
	if(ABS(eigenVectors.Get(0, 0) * eigenVectors.Get(0, 1) - eig[0] * eig[1]) > .0001)
		throw "answers don't agree";
/*	// Test SlowFourierTransform
	GArffRelation rel2; rel2.AddAttribute(new GArffAttribute(true, 0, NULL)); rel2.AddAttribute(new GArffAttribute(false, 0, NULL)); rel2.AddAttribute(new GArffAttribute(false, 0, NULL)); GArffData data2(4); double* pVec; pVec = new double[3]; pVec[0] = 0; pVec[1] = 1; pVec[2] = 0; data2.AddVector(pVec); pVec = new double[3]; pVec[0] = 1.4; pVec[1] = 3; pVec[2] = 0; data2.AddVector(pVec); pVec = new double[3]; pVec[0] = 1.6; pVec[1] = 2; pVec[2] = 0; data2.AddVector(pVec); pVec = new double[3]; pVec[0] = 3; pVec[1] = 4; pVec[2] = 0; data2.AddVector(pVec); GArffData* pFourierData = data2.SlowFourierTransform(&rel2, true); for(i = 0; i < 4; i++) { pVec = pFourierData->GetVector(i); //pVec[0] = i; } GArffData* pRoundTripData = pFourierData->SlowFourierTransform(&rel2, false); for(i = 0; i < 4; i++) pVec = pRoundTripData->GetVector(i);*/
}
#endif // !NO_TEST_CODE
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?