glearner.cpp

来自「一个由 Mike Gashler 编写的机器学习库（包含神经网络等算法）」· C++ 代码 · 共 276 行

CPP
276
字号
/*	Copyright (C) 2006, Mike Gashler

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	see http://www.gnu.org/copyleft/lesser.html
*/

#include "GLearner.h"
#include "GArff.h"
#include <stdlib.h>
#include <string.h>
#include "GMacros.h"

// Base class for supervised learners. pRelation describes which attributes
// are inputs and which are outputs. The learner does not take ownership of
// the relation; the caller must keep it alive for the learner's lifetime.
GSupervisedLearner::GSupervisedLearner(GArffRelation* pRelation)
{
	m_pRelation = pRelation;
}

GSupervisedLearner::~GSupervisedLearner()
{
}

// Measures predictive accuracy on pData. For each row, the input values are
// copied into a scratch vector, the output slots are clobbered with a
// sentinel (so a learner that fails to write them can't accidentally score
// well), and Eval() is asked to fill in the outputs.
// Scoring: a discrete (nominal) output scores 1 if the predicted class
// matches the target, else 0. A continuous output scores 1/(1+d^2) where d
// is the prediction error -- 1 for a perfect prediction, approaching 0 as
// the error grows.
// Returns the mean score over all (row, output) pairs, or 0 if pData is
// empty or the relation has no outputs.
double GSupervisedLearner::MeasurePredictiveAccuracy(GArffData* pData)
{
	int nInputCount = m_pRelation->GetInputCount();
	int nOutputCount = m_pRelation->GetOutputCount();
	int nAttributes = m_pRelation->GetAttributeCount();
	int nRowCount = pData->GetSize();
	if(nRowCount <= 0 || nOutputCount <= 0)
		return 0; // nothing to measure--avoid dividing by zero below
	int nIndex;
	GTEMPBUF(double, pVector, nAttributes);
	double* pRow;
	double dCorrectCount = 0;
	double d;
	int nTotalCount = 0;
	int n, i;
	for(n = 0; n < nRowCount; n++)
	{
		pRow = pData->GetVector(n);

		// Copy the input values into the vector
		for(i = 0; i < nInputCount; i++)
		{
			nIndex = m_pRelation->GetInputIndex(i);
			pVector[nIndex] = pRow[nIndex];
		}

		// Mess up the output values just to be safe
		for(i = 0; i < nOutputCount; i++)
		{
			nIndex = m_pRelation->GetOutputIndex(i);
			pVector[nIndex] = 1e100;
		}

		// Evaluate
		Eval(pVector);

		// Check the answer
		for(i = 0; i < nOutputCount; i++)
		{
			nIndex = m_pRelation->GetOutputIndex(i);
			if(m_pRelation->GetAttribute(nIndex)->IsContinuous())
			{
				// Exact-match accuracy doesn't really make sense for real
				// values, so use a squashed score in (0,1]: 1/(1+d^2).
				// (BUGFIX: the original added 1-1/(1+d^2), which is the
				// squashed *error*--0 for a perfect prediction--inverted
				// relative to the discrete branch below.)
				d = pRow[nIndex] - pVector[nIndex];
				dCorrectCount += (1.0 / (1.0 + (d * d)));
			}
			else
			{
				if((int)pVector[nIndex] == (int)pRow[nIndex])
					dCorrectCount++;
			}
			nTotalCount++;
		}
	}
	return dCorrectCount / nTotalCount;
}

// Measures mean squared error on pData, using the same
// copy-inputs / clobber-outputs / Eval() protocol as
// MeasurePredictiveAccuracy above. Continuous outputs contribute their
// squared prediction error; a misclassified discrete output contributes an
// error of 1 (squared error has no natural meaning for nominal values).
// Returns the error averaged over all (row, output) pairs, or 0 if pData is
// empty or the relation has no outputs.
double GSupervisedLearner::MeasureMeanSquaredError(GArffData* pData)
{
	int nInputCount = m_pRelation->GetInputCount();
	int nOutputCount = m_pRelation->GetOutputCount();
	int nAttributes = m_pRelation->GetAttributeCount();
	int nRowCount = pData->GetSize();
	if(nRowCount <= 0 || nOutputCount <= 0)
		return 0; // nothing to measure--avoid dividing by zero below
	int nIndex;
	GTEMPBUF(double, pVector, nAttributes);
	double* pRow;
	double dError = 0;
	double d;
	int n, i;
	for(n = 0; n < nRowCount; n++)
	{
		pRow = pData->GetVector(n);

		// Copy the input values into the sample
		for(i = 0; i < nInputCount; i++)
		{
			nIndex = m_pRelation->GetInputIndex(i);
			pVector[nIndex] = pRow[nIndex];
		}

		// Mess up the output values just to be safe
		for(i = 0; i < nOutputCount; i++)
		{
			nIndex = m_pRelation->GetOutputIndex(i);
			pVector[nIndex] = 1e100;
		}

		// Evaluate
		Eval(pVector);

		// Check the answer
		for(i = 0; i < nOutputCount; i++)
		{
			nIndex = m_pRelation->GetOutputIndex(i);
			if(m_pRelation->GetAttribute(nIndex)->IsContinuous())
			{
				d = pRow[nIndex] - pVector[nIndex];
				dError += (d * d);
			}
			else
			{
				// Squared error doesn't really make sense for discrete
				// values, so we'll just say an incorrect classification
				// corresponds to an error of 1.
				if((int)pVector[nIndex] != (int)pRow[nIndex])
					dError += 1;
			}
		}
	}
	return dError / (nRowCount * nOutputCount);
}

// Performs nFolds-fold cross-validation on pData. The rows are dealt
// round-robin into nFolds sets; for each fold the learner is Reset(),
// trained on the union of the other folds, and measured on the held-out
// fold. If bRegression is true, the per-fold measure is
// MeasureMeanSquaredError (lower is better); otherwise it is
// MeasurePredictiveAccuracy (higher is better). Returns the measure
// averaged over the folds, or 0 if nFolds < 1.
// Note: the temporary sets share pData's row vectors (no copies); they
// drop the vectors before being deleted, so ownership stays with pData.
double GSupervisedLearner::CrossValidate(GArffData* pData, int nFolds, bool bRegression)
{
	if(nFolds < 1)
		return 0; // degenerate request--avoid dividing by zero below

	// Split the data into parts
	GArffData** pSets = (GArffData**)alloca(sizeof(GArffData*) * nFolds);
	int nSize = pData->GetSize() / nFolds + nFolds; // generous capacity hint per fold
	int n, i, j;
	for(n = 0; n < nFolds; n++)
		pSets[n] = new GArffData(nSize);
	int nRowCount = pData->GetSize();
	double* pRow;
	for(n = 0; n < nRowCount; n++)
	{
		pRow = pData->GetVector(n);
		pSets[n % nFolds]->AddVector(pRow);
	}

	// Do the training and testing
	double d;
	double dScore = 0;
	for(n = 0; n < nFolds; n++)
	{
		// Merge all sets but one
		GArffData* pTrainer = new GArffData(pData->GetSize());
		for(i = 0; i < nFolds; i++)
		{
			if(i == n)
				continue;
			int nCount = pSets[i]->GetSize();
			for(j = 0; j < nCount; j++)
			{
				pRow = pSets[i]->GetVector(j);
				pTrainer->AddVector(pRow);
			}
		}

		// Train the learner
		Reset();
		Train(pTrainer);

		// Test it
		if(bRegression)
			d = MeasureMeanSquaredError(pSets[n]);
		else
			d = MeasurePredictiveAccuracy(pSets[n]);
		dScore += d;

		// Clean up (drop the shared vectors so deleting the set
		// doesn't free rows still owned by pData)
		pTrainer->DropAllVectors();
		delete(pTrainer);
	}
	dScore /= nFolds;

	// Clean up
	for(n = 0; n < nFolds; n++)
	{
		pSets[n]->DropAllVectors();
		delete(pSets[n]);
	}
	return dScore;
}

// ---------------------------------------------------------------

// A trivial reference learner: it predicts the training set's mean for
// continuous outputs and the most common class for discrete outputs,
// ignoring the inputs entirely. Useful as a baseline that any real
// learner should beat.
GBaselineLearner::GBaselineLearner(GArffRelation* pRelation) : GSupervisedLearner(pRelation)
{
	int nOutputs = pRelation->GetOutputCount();
	m_pOutputs = new double[nOutputs]; // one baseline value per output attribute
}

// virtual
GBaselineLearner::~GBaselineLearner()
{
	delete[] m_pOutputs;
}

// virtual
void GBaselineLearner::Reset()
{
	// Nothing to reset--Train() overwrites all state unconditionally.
}

// virtual
// Computes the baseline value for each output attribute: the mean of the
// column for continuous attributes, or the modal (most frequent) class for
// discrete attributes. Negative class values (used to mark unknowns) are
// ignored when counting. An empty training set yields a mean of 0.
void GBaselineLearner::Train(GArffData* pData)
{
	int nOutputs = m_pRelation->GetOutputCount();
	int i, j, index, val;
	int nCount = pData->GetSize();

	// Size one shared histogram buffer for the largest discrete output
	int nMaxValues = 0;
	for(i = 0; i < nOutputs; i++)
		nMaxValues = MAX(nMaxValues, m_pRelation->GetAttribute(m_pRelation->GetOutputIndex(i))->GetValueCount());
	GTEMPBUF(int, counts, nMaxValues);

	double dSum;
	GArffAttribute* pAttr;
	double* pVector;
	for(i = 0; i < nOutputs; i++)
	{
		index = m_pRelation->GetOutputIndex(i);
		pAttr = m_pRelation->GetAttribute(index);
		if(pAttr->IsContinuous())
		{
			// Baseline for a continuous output is the column mean
			dSum = 0;
			for(j = 0; j < nCount; j++)
			{
				pVector = pData->GetVector(j);
				dSum += pVector[index];
			}
			m_pOutputs[i] = (nCount > 0 ? dSum / nCount : 0); // guard empty set
		}
		else
		{
			// Baseline for a discrete output is the most common class
			memset(counts, 0, sizeof(int) * pAttr->GetValueCount());
			for(j = 0; j < nCount; j++)
			{
				pVector = pData->GetVector(j);
				val = (int)pVector[index];
				if(val >= 0) // negative values mark unknowns--skip them
					counts[val]++;
			}
			val = 0;
			for(j = 1; j < pAttr->GetValueCount(); j++)
			{
				if(counts[j] > counts[val])
					val = j;
			}
			m_pOutputs[i] = (double)val;
		}
	}
}

// virtual
// Writes the precomputed baseline value into each output slot of pVector.
// The input values in pVector are ignored.
void GBaselineLearner::Eval(double* pVector)
{
	int nOutputs = m_pRelation->GetOutputCount();
	int i;
	for(i = 0; i < nOutputs; i++)
		pVector[m_pRelation->GetOutputIndex(i)] = m_pOutputs[i];
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?