garff.cpp

来自「Mike Gashler 的机器学习库（包含神经网络等）」· C++ 代码 · 共 1,909 行 · 第 1/3 页

CPP
1,909
字号
			dVal = pRow[nIndex];			for(n = 1; n < nRowCount; n++)			{				pRow = GetVector(n);				if(pRow[nIndex] != dVal)					return false;			}		}		else		{			for(n = 0; n < nRowCount; n++)			{				pRow = GetVector(n);				nVal = (int)pRow[nIndex];				if(nVal >= 0)				{					n++;					break;				}			}			for( ; n < nRowCount; n++)			{				pRow = GetVector(n);				nTmp = (int)pRow[nIndex];				if(nTmp != nVal && nTmp >= 0)					return false;			}		}	}	return true;}void GArffData::RandomlyReplaceMissingData(GArffRelation* pRelation){	int n, i, j;	int nRowCount = GetSize();	int nAttrCount = pRelation->GetAttributeCount();	int nMaxValues = 0;	int nValues;	int nVal;	int nSum;	int nRand;	int* pCounts = NULL;	double* pRow;	GArffAttribute* pAttr;	for(i = 0; i < nAttrCount; i++)	{		// Make a buffer to hold the counts		pAttr = pRelation->GetAttribute(i);		if(pAttr->IsContinuous())			continue;		nValues = pAttr->GetValueCount();		if(nValues > nMaxValues)		{			delete(pCounts);			nMaxValues = pAttr->GetValueCount() + 3;			pCounts = new int[nMaxValues];		}		// Count the number of each value		memset(pCounts, '\0', sizeof(int) * nValues);		for(n = 0; n < nRowCount; n++)		{			nVal = (int)GetVector(n)[i];			if(nVal >= 0)			{				GAssert(nVal < nValues, "out of range");				pCounts[nVal]++;			}			else			{				GAssert(nVal == -1, "out of range");			}		}		// Sum the value counts		nSum = 0;		for(n = 0; n < nValues; n++)			nSum += pCounts[n];		// Replace the missing values		for(n = 0; n < nRowCount; n++)		{			pRow = GetVector(n);			nVal = (int)pRow[i];			if(nVal < 0)			{				nRand = (int)(GBits::GetRandomUint() % nSum);				for(j = 0; ; j++)				{					GAssert(j < nValues, "internal inconsistency");					nRand -= pCounts[j];					if(nRand < 0)					{						pRow[i] = (double)j;						break;					}				}			}		}	}}void GArffData::ReplaceMissingAttributeWithMostCommonValue(GArffRelation* pRelation, int nAttribute){	GArffAttribute* pAttr = pRelation->GetAttribute(nAttribute);	if(pAttr->IsContinuous())		return; // missing values 
are currently only supported for discreet values	int nValues = pAttr->GetValueCount();	GTEMPBUF(int, pCounts, nValues);	memset(pCounts, '\0', sizeof(int) * nValues);	double* pRow;	int nRowCount = GetSize();	int n, nVal;	for(n = 0; n < nRowCount; n++)	{		pRow = GetVector(n);		nVal = (int)pRow[nAttribute];		if(nVal < 0)			continue;		GAssert(nVal < nValues, "out of range");		pCounts[nVal]++;	}	int nBest = 0;	for(n = 1; n < nValues; n++)	{		if(pCounts[n] > pCounts[nBest])			nBest = n;	}	for(n = 0; n < nRowCount; n++)	{		pRow = GetVector(n);		nVal = (int)pRow[nAttribute];		if(nVal < 0)		{			pRow[nAttribute] = (double)nBest;		}	}}void GArffData::Print(int nAttributes){	int nRows = GetSize();	double* pRow;	int n, i;	for(n = 0; n < nRows; n++)	{		pRow = GetVector(n);		printf("%f", pRow[0]);		for(i = 1; i < nAttributes; i++)			printf("\t%f", pRow[i]);		printf("\n");	}}int ComputeMinimumVariancePivotComparer(void* pThis, void* pA, void* pB){	int nAttr = *(int*)pThis;	double* pdA = (double*)pA;	double* pdB = (double*)pB;	if(pdA[nAttr] >= pdB[nAttr])		return 1;	else		return -1;}double GArffData::ComputeMinimumVariancePivot(int nAttr){	int nRows = GetSize();	GPointerArray arr(nRows);	int n;	for(n = 0; n < nRows; n++)		arr.AddPointer(GetVector(n));	arr.Sort(ComputeMinimumVariancePivotComparer, &nAttr);	double dBestPivotScore = 1e100;	double dBestPivot = 0;	double dPivot, d;	double* pRow1;	double* pRow2;	double dMean1, dMean2, dVar1, dVar2;	int nCount1, nCount2, i;	for(n = nRows - 2; n >= 0; n--)	{		// Try a pivot		pRow1 = (double*)arr.GetPointer(n);		pRow2 = (double*)arr.GetPointer(n + 1);		dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2;		// Compute the mean of each half		dMean1 = 0;		dMean2 = 0;		nCount1 = 0;		nCount2 = 0;		for(i = 0; i < nRows; i++)		{			pRow1 = GetVector(i);			if(pRow1[nAttr] < dPivot)			{				nCount1++;				dMean1 += pRow1[nAttr];			}			else			{				nCount2++;				dMean2 += pRow1[nAttr];			}		}		dMean1 /= nCount1;		dMean2 /= nCount2;		// Compute the variance of each 
half		dVar1 = 0;		dVar2 = 0;		for(i = 0; i < nRows; i++)		{			pRow1 = GetVector(i);			if(pRow1[nAttr] < dPivot)			{				d = pRow1[nAttr] - dMean1;				dVar1 += (d * d);			}			else			{				d = pRow2[nAttr] - dMean2;				dVar2 += (d * d);			}		}		dVar1 /= nCount1;		dVar2 /= nCount2;		d = dVar1 + dVar2;				// See if we've got a new best score		if(d < dBestPivotScore)		{			dBestPivotScore = d;			dBestPivot = dPivot;		}	}	return dBestPivot;}bool GArffData::PickPivotToReduceInfo(double* pOutPivot, double* pOutputInfo, GArffRelation* pRelation, int nAttr){	int nRows = GetSize();	int n;	double dBestPivotScore = 1e100;	double dPivot, d;	double* pRow1;	double* pRow2;	bool bGotOne = false;	for(n = 0; n < 10; n++)	{		// Try a pivot		pRow1 = GetVector(rand() % nRows);		pRow2 = GetVector(rand() % nRows);		dPivot = (pRow1[nAttr] + pRow2[nAttr]) / 2;		// Split at the pivot and measure the sum info		GArffData* pData2 = SplitByPivot(nAttr, dPivot);		if(GetSize() > 0 && pData2->GetSize() > 0)		{			d = (pRelation->MeasureTotalOutputInfo(this) * GetSize() + pRelation->MeasureTotalOutputInfo(pData2) * pData2->GetSize()) / (double)(GetSize() + pData2->GetSize());			// See if we've got a new best score			if(d < dBestPivotScore)			{				dBestPivotScore = d;				*pOutPivot = dPivot;				bGotOne = true;			}		}		Merge(pData2);		delete(pData2);	}	*pOutputInfo = dBestPivotScore;	return bGotOne;}void GArffData::ComputePrincipleComponent(int nDims, double* pOutVector, int nIterations, bool bExtract){	// Initialize the out-vector to a random direction and compute the mean	int i, j, n;	double* pMean = new double[2 * nDims];	Holder<double*> hMean(pMean);	for(j = 0; j < nDims; j++)	{		pOutVector[j] = GBits::GetRandomDouble();		pMean[j] = ComputeMean(j);	}	GVector::Normalize(pOutVector, nDims);	// Translate the data such that the mean is at the origin	double* pVector;	int nCount = GetSize();	for(n = 0; n < nCount; n++)	{		pVector = GetVector(n);		for(j = 0; j < nDims; j++)			pVector[j] -= pMean[j];	}	// 
Iterate	double* pAccumulator = pMean + nDims;	double d;	for(i = 0; i < nIterations; i++)	{		for(j = 0; j < nDims; j++)			pAccumulator[j] = 0;		for(n = 0; n < nCount; n++)		{			pVector = GetVector(n);			d = GVector::ComputeDotProduct(pVector, pOutVector, nDims);			for(j = 0; j < nDims; j++)				pAccumulator[j] += pVector[j] * d;		}		GVector::Normalize(pAccumulator, nDims);		//if(GVector::ComputeSquaredDistance(pAccumulator, pOutVector, nDims) < 1e-18)		//	break;		memcpy(pOutVector, pAccumulator, sizeof(double) * nDims);	}	// Normalize	GVector::Normalize(pOutVector, nDims);	// Optionally remove the component	if(bExtract)	{		for(i = 0; i < nCount; i++)		{			pVector = GetVector(i);			d = GVector::ComputeDotProduct(pVector, pOutVector, nDims);			for(j = 0; j < nDims; j++)				pVector[j] -= d * pOutVector[j];		}	}	// Restore the data to its original position	for(n = 0; n < nCount; n++)	{		pVector = GetVector(n);		for(j = 0; j < nDims; j++)			pVector[j] += pMean[j];	}}double GArffData::ComputeCovariance(int nAttr1, double dMean1, int nAttr2, double dMean2){	int nRowCount = GetSize();	double* pVector;	double dSum = 0;	int i;	for(i = 0; i < nRowCount; i++)	{		pVector = GetVector(i);		dSum += ((pVector[nAttr1] - dMean1) * (pVector[nAttr2] - dMean2));	}	return dSum / (nRowCount - 1); // todo: why do we subtract one here? 
Is that ALWAYS the right thing to do?}void GArffData::ComputeCovarianceMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation){	// Resize the matrix	int nInputs = pRelation->GetInputCount();	pOutMatrix->Resize(nInputs, nInputs);	// Compute the deviations	Holder<double*> hMeans(new double[nInputs]);	double* pMeans = hMeans.Get();	int nRowCount = GetSize();	double* pRow;	int n, i, nIndex;	for(i = 0; i < nInputs; i++)	{		nIndex = pRelation->GetInputIndex(i);		// Compute the mean		double dSum = 0;		for(n = 0; n < nRowCount; n++)		{			pRow = GetVector(n);			dSum += pRow[nIndex];		}		pMeans[i] = dSum / nRowCount;	}	// Compute the covariances for half the matrix	for(i = 0; i < nInputs; i++)	{		for(n = i; n < nInputs; n++)			pOutMatrix->Set(i, n, ComputeCovariance(pRelation->GetInputIndex(i), pMeans[i], pRelation->GetInputIndex(n), pMeans[n]));	}	// Fill out the other half of the matrix	for(i = 1; i < nInputs; i++)	{		for(n = 0; n < i; n++)			pOutMatrix->Set(i, n, pOutMatrix->Get(n, i));	}}void GArffData::ComputeCoprobabilityMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation, int nAttr, double noDataValue){	// Resize the matrix	GArffAttribute* pAttr = pRelation->GetAttribute(nAttr);	int nRows = pAttr->GetValueCount();	int nAttributes = pRelation->GetAttributeCount();	int nCols = 0;	int i;	for(i = 0; i < nAttributes; i++)	{		GArffAttribute* pAttrCol = pRelation->GetAttribute(i);		nCols += pAttrCol->GetValueCount();	}	pOutMatrix->Resize(nRows, nCols);	// Compute the coprobabilities	int nRowCount = GetSize();	int row, col, nMatch, nTotal, nAttrCol, nVal;	double* pRow;	for(row = 0; row < nRows; row++)	{		col = 0;		for(nAttrCol = 0; nAttrCol < nAttributes; nAttrCol++)		{			GArffAttribute* pAttrCol = pRelation->GetAttribute(nAttrCol);			for(nVal = 0; nVal < pAttrCol->GetValueCount(); nVal++)			{				nMatch = 0;				nTotal = 0;				for(i = 0; i < nRowCount; i++)				{					pRow = GetVector(i);					if((int)pRow[nAttrCol] == nVal)					{						nTotal++;						if((int)pRow[nAttr] == row)				
			nMatch++;					}				}				if(nTotal == 0)					pOutMatrix->Set(row, col, noDataValue);				else					pOutMatrix->Set(row, col, (double)nMatch / nTotal);				col++;			}		}		GAssert(col == nCols, "problem with columns");	}}int DimensionComparer(void* pThis, void* pA, void* pB){	int nDim = *(int*)pThis;	if(((double*)pA)[nDim] < ((double*)pB)[nDim])		return -1;	else if(((double*)pA)[nDim] > ((double*)pB)[nDim])		return 1;	else		return 0;}void GArffData::Sort(int nDimension){	GPointerArray::Sort(DimensionComparer, &nDimension);}/*GArffData* GArffData::SlowFourierTransform(GArffRelation* pRel, bool bForward){	int nCount = GetSize();	GAssert(nCount > 1, "Must have at least two points");	GArffData* pTransformed = new GArffData(nCount);	double dSumReal, dSumImag, dTwidReal, dTwidImag, dR, dI, dTmp;	double* pVec1;	double* pVec2;	double* pOutputVector;	int nInputCount = pRel->GetInputCount();	int nOutputCount = pRel->GetOutputCount();	GAssert((nOutputCount & 1) == 0, "Expected an even number of outputs. Even=Real, Odd=Imag");	int i, j, nInput, nOutput, indexReal, indexImag, inputIndex;	double dCircum = bForward ? 
-2.0 * PI : 2.0 * PI;	// Compute the mins and ranges	double* pMinsAndRanges = new double[nInputCount * 2];	ArrayHolder<double*> hMinsAndRanges(pMinsAndRanges);	for(nInput = 0; nInput < nInputCount; nInput++)	{		GetMinAndRange(pRel->GetInputIndex(nInput), &pMinsAndRanges[2 * nInput], &pMinsAndRanges[2 * nInput + 1]);		pMinsAndRanges[2 * nInput + 1] += (pMinsAndRanges[2 * nInput + 1] / (nCount - 1));	}	// Perform the transform	for(i = 0; i < nCount; i++)	{		pOutputVector = new double[nInputCount + nOutputCount];		pVec1 = GetVector(i);		for(nInput = 0; nInput < nInputCount; nInput++)		{			j = pRel->GetInputIndex(nInput);			pOutputVector[j] = pVec1[j];		}		for(nOutput = 0; nOutput < nOutputCount; nOutput += 2)		{			indexReal = pRel->GetOutputIndex(nOutput);			indexImag = pRel->GetOutputIndex(nOutput + 1);			dSumReal = 0;			dSumImag = 0;			for(j = 0; j < nCount; j++)			{				pVec2 = GetVector(j);				dTwidReal = 1;				dTwidImag = 0;				for(nInput = 0; nInput < nInputCount; nInput++)				{					inputIndex = pRel->GetInputIndex(nInput);					dTmp = dCircum * (pVec1[inputIndex] - pMinsAndRanges[2 * nInput]) * (pVec2[inputIndex] - pMinsAndRanges[2 * nInput]) / pMinsAndRanges[2 * nInput + 1];					dR = cos(dTmp);					dI = sin(dTmp);					dTmp = dTwidReal;					dTwidReal = dTwidReal * dR - dTwidImag * dI;					dTwidImag = dTmp * dI + dR * dTwidImag;				}				dSumReal += dTwidReal * pVec2[indexReal] - dTwidImag * pVec2[indexImag];				dSumImag += dTwidReal * pVec2[indexImag] + dTwidImag * pVec2[indexReal];			}			pOutputVector[indexReal] = dSumReal;			pOutputVector[indexImag] = dSumImag;		}		pTransformed->AddVector(pOutputVector);	}	// Scale the reverse transform	if(!bForward)	{		double dFactor = 1.0;		for(nInput = 0; nInput < nInputCount; nInput++)			dFactor /= pMinsAndRanges[2 * nInput + 1];		for(i = 0; i < nCount; i++)		{			pOutputVector = pTransformed->GetVector(i);			for(nOutput = 0; nOutput < nOutputCount; nOutput++)				pOutputVector[pRel->GetOutputIndex(nOutput)] *= dFactor;		}	}	
return pTransformed;}*/void GArffData::AddGaussianNoiseDimensions(GArffRelation* pRelation, int nNoiseDims){	int nOldAttributes = pRelation->GetAttributeCount();	int i, j;	for(i = 0; i < nNoiseDims; i++)		pRelation->AddAttribute(new GArffAttribute(true, 0, NULL));	int nNewAttributes = pRelation->GetAttributeCount();	for(i = 0; i < GetSize(); i++)	{		double* pOldVector = GetVector(i);		double* pNewVector = new double[nNewAttributes];		memcpy(pNewVector, pOldVector, sizeof(double) * nOldAttributes);		for(j = nOldAttributes; j < nNewAttributes; j++)			pNewVector[j] = GBits::GetRandomGaussian();		SwapVector(i, pNewVector);		delete[] pOldVector;	}}#ifndef NO_TEST_CODE// staticvoid GArffData::Test(){	// Make some data	GArffData data(100);	int i;	for(i = 0; i < 100; i++)	{		double* pNewVector = new double[2];		pNewVector[0] = GBits::GetRandomDouble();		pNewVector[1] = 2 * pNewVector[0];		data.AddVector(pNewVector);	}	// Find principle components	double eig[2];	data.ComputePrincipleComponent(2, eig, 10, false);	if(ABS(eig[0] * 2 - eig[1]) > .0001)		throw "incorrect value";	// Compute principle components via eigenvectors of covariance matrix, and	// make sure they're the same	GArffRelation rel;	rel.AddAttribute(new GArffAttribute(true, 0, NULL));	rel.AddAttribute(new GArffAttribute(true, 0, NULL));	GMatrix m;	data.ComputeCovarianceMatrix(&m, &rel);	GMatrix eigenVectors;	eigenVectors.ComputeEigenVectors(1, &m);	if(ABS(eigenVectors.Get(0, 0) * eigenVectors.Get(0, 1) - eig[0] * eig[1]) > .0001)		throw "answers don't agree";/*	// Test SlowFourierTransform	GArffRelation rel2;	rel2.AddAttribute(new GArffAttribute(true, 0, NULL));	rel2.AddAttribute(new GArffAttribute(false, 0, NULL));	rel2.AddAttribute(new GArffAttribute(false, 0, NULL));	GArffData data2(4);	double* pVec;	pVec = new double[3]; pVec[0] = 0; pVec[1] = 1; pVec[2] = 0; data2.AddVector(pVec);	pVec = new double[3]; pVec[0] = 1.4; pVec[1] = 3; pVec[2] = 0; data2.AddVector(pVec);	pVec = new double[3]; pVec[0] = 1.6; 
pVec[1] = 2; pVec[2] = 0; data2.AddVector(pVec);	pVec = new double[3]; pVec[0] = 3; pVec[1] = 4; pVec[2] = 0; data2.AddVector(pVec);	GArffData* pFourierData = data2.SlowFourierTransform(&rel2, true);	for(i = 0; i < 4; i++)	{		pVec = pFourierData->GetVector(i);		//pVec[0] = i;	}	GArffData* pRoundTripData = pFourierData->SlowFourierTransform(&rel2, false);	for(i = 0; i < 4; i++)		pVec = pRoundTripData->GetVector(i);*/}#endif // !NO_TEST_CODE

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?