garff.h
来自「一个由Mike Gashler完成的机器学习方面的includes neural」· C头文件 代码 · 共 357 行
H
357 行
/* Copyright (C) 2006, Mike Gashler This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. see http://www.gnu.org/copyleft/lesser.html*/#ifndef __GARFF_H__#define __GARFF_H__#include "GArray.h"#include "GSearch.h"// ARFF = Attribute-Relation File Formatclass GArffAttribute;class GArffData;class GPointerArray;class GMatrix;class GArffRelation{protected: char* m_szName; GPointerArray* m_pAttributes; int m_nInputCount; int* m_pInputIndexes; int m_nOutputCount; int* m_pOutputIndexes;public: GArffRelation(); ~GArffRelation(); // Parses an ARFF file and returns a GArffRelation and a GArffData. You must delete them both. // This will throw an exception if there's an error. (You should catch const char*) static void ParseArffFile(GArffRelation** ppOutRelation, GArffData** ppOutData, const char* szFile, int nLen); // Loads an ARFF file and returns a GArffRelation and a GArffData. You must delete them both. // This will throw an exception if there's an error. (You should catch const char*) static void LoadArffFile(GArffRelation** ppOutRelation, GArffData** ppOutData, const char* szFilename); // Writes out an ARFF file void SaveArffFile(GArffData* pData, const char* szFilename); // Returns the total number of attributes (both input and output) in this relation int GetAttributeCount(); // Add an attribute to the relation void AddAttribute(GArffAttribute* pAttr); // Returns the number of input attributes in this relation int GetInputCount(); // Returns the number of output attributes in this relation int GetOutputCount(); // Returns the attribute index of the n'th input attribute int GetInputIndex(int n); // Returns the attribute index of the n'th output attribute int GetOutputIndex(int n); // Returns the attribute at the specified attribute index GArffAttribute* GetAttribute(int nAttribute); // Returns the sum of entropy (for discreet attributes) and variance (for continuous // attributes) for all output values in the data set double MeasureTotalOutputInfo(GArffData* pData); // Returns the name of the relation const char* GetName() { return m_szName; } // Compute the square of the distance between the two points (using input values only) double ComputeInputDistanceSquared(double* pRow1, double* pRow2); // Compute the square of the distance between the two points (using output values only) double ComputeOutputDistanceSquared(double* pRow1, double* pRow2); // Computes the squared distance between input points after scaling by the value in // the array pInputScales. (pScales should be an array with size equal to // the number of attributes in the relation, even though only the values corresponding // to input attributes are actually used.) double ComputeScaledInputDistanceSquared(double* pRow1, double* pRow2, double* pScales); // Returns the number of continuous attributes in the relation int CountContinuousAttributes(); // Counts the size of the corresponding vector-mode input vector // Enumerations with nCap or more values will be treated as a single continuous value int CountVectorModeInputs(int nCap = 25); // Counts the size of the corresponding vector-mode output vector // Enumerations with nCap or more values will be treated as a single continuous value int CountVectorModeOutputs(int nCap = 25); // Converts a full normal-mode vector (pIn) to a vector-mode // input-only vector (pOut) // Enumerations with nCap or more values will be treated as a single continuous value void InputsToVectorMode(double* pIn, double* pOut, int nCap = 25); // Converts a full normal-mode vector (pIn) to a vector-mode // output-only vector (pOut) // Enumerations with nCap or more values will be treated as a single continuous value void OutputsToVectorMode(double* pIn, double* pOut, int nCap = 25); // Converts a vector-mode output-only vector (pIn) to a normal-mode // full vector (pOut). (The inputs of pOut are untouched). // Enumerations with nCap or more values will be treated as a single continuous value void VectorModeToOutputs(double* pIn, double* pOut, int nCap = 25);protected: double* ParseDataRow(const char* szFile, int nLen, int nLine, int nCommentAttributes); void CountInputs();};class GArffAttribute{protected: char* m_szName; int m_nValues; char** m_szValues; bool m_bIsInput; GArffAttribute();public: // If nValues is 0, then this is a continuous attribute. // szValues can be NULL if the values aren't named. GArffAttribute(bool bIsInput, int nValues, const char** szValues); ~GArffAttribute(); // Makes a deep copy of this object GArffAttribute* NewCopy(); // Parse the attribute section of a ".arff" file static GArffAttribute* Parse(const char* szFile, int nLen, int nLine); // Returns true if this is a continuous (as opposed to discreet) attribute bool IsContinuous() { return m_nValues == 0; } // Makes the attribute continuous void SetContinuous(); // Returns the index of the specified value int FindEnumeratedValue(const char* szValue); // Returns the number of discreet values in this attribute int GetValueCount(); // Returns the name of this attribute const char* GetName() { return m_szName; } // Returns the n'th discreet value that this attribute can have const char* GetValue(int n); // Returns true if this is an input attribute bool IsInput() { return m_bIsInput; } // Sets whether this is an input or output attribute. void SetIsInput(bool b) { m_bIsInput = b; }};class GArffData : public GPointerArray{public: // nGrowSize specifies the amount of space (number of vectors) to initially // allocate for data. It will dynamically resize as necessary. GArffData(int nGrowSize); ~GArffData(); // Takes ownership of pVector inline void AddVector(double* pVector) { AddPointer(pVector); } // Returns a pointer to the vector inline double* GetVector(int nIndex) { return ((double*)GetPointer(nIndex)); } // Adds a copy of the vector to the data set void CopyVector(double* pVector, int nAttributeCount); // Swaps pVector with the vector at nIndex. You're responsible to delete the // vector this returns double* SwapVector(int nIndex, double* pVector); // you must delete the vector this returns double* DropVector(int nIndex); // deletes the vector with the specified index void DeleteVector(int nIndex); // Abandons (leaks) all the vectors of data void DropAllVectors(); // Randomizes the order void Shuffle(); // Sorts the data from smallest to largest in the specified dimension void Sort(int nDimension); // Splits this set of data into two sets such that this set // contains all vectors where the value in element "nColumn" is // greater than dPivot and the set returned contains those // less-than-or-equal-to dPivot. GArffData* SplitByPivot(int nColumn, double dPivot); // Splits this set of data into a unique set for each // possible enumeration value of the attribute. You are // responsible to delete each set of data as well as the // array of pointers that this returns GArffData** SplitByAttribute(GArffRelation* pRelation, int nAttribute); // Splits this set of data into two sets such that this set // contains "nRows" vectors and the returned set contains the rest GArffData* SplitBySize(int nRows); // Steals all the vectors from pData and adds them to this set. // (You still have to delete pData) void Merge(GArffData* pData); // Measures the entropy of this set relative to the specified attribute double MeasureEntropy(GArffRelation* pRelation, int nColumn); // Snaps all non-continuous output values to the nearest discreet value void DiscretizeNonContinuousOutputs(GArffRelation* pRelation); // Finds the min and the range of the values of the specified attribute void GetMinAndRange(int nAttribute, double* pMin, double* pRange); // Computes the arithmetic mean of a single attribute double ComputeMean(int nAttribute); // Finds the arithmetic means of all attributes void GetMeans(double* pOutMeans, int nAttributes); // Computes the average variance of a single attribute double ComputeVariance(double dMean, int nAttribute); // Finds the average variance of all the attributes void GetVariance(double* pOutVariance, double* pMeans, int nAttributes); // Throws out all of the vectors in which any of the first "nAttributes" // attributes has a value that is more than "dStandardDeviations" // deviations away from the mean of that attribute. Note that a better // technique would be to compute Euclidian distance using all the // attributes together, but I was feeling too lazy when I wrote this. int RemoveOutlyers(double dStandardDeviations, int nAttributes); // Normalizes the specified attribute values void Normalize(int nAttribute, double dInputMin, double dInputRange, double dOutputMin, double dOutputRange); // Normalize a value from the input min and range to the output min and range static double Normalize(double dVal, double dInputMin, double dInputRange, double dOutputMin, double dOutputRange); // Produce a vector in which each attribute holds the most common value for that attribute double* MakeSetOfMostCommonOutputs(GArffRelation* pRelation); // Returns true if all output values in the data set are the same bool IsOutputHomogenous(GArffRelation* pRelation); // Replaces missing data with random values void RandomlyReplaceMissingData(GArffRelation* pRelation); // Replaces all missing data with the most common value for the attribute void ReplaceMissingAttributeWithMostCommonValue(GArffRelation* pRelation, int nAttribute); // This is an efficient algorithm for iteratively computing the principle component // vector of the data. See "EM Algorithms for PCA and SPCA" by Sam Roweis, 1998 NIPS. // if bExtract is true, it will remove the component from the data (so you can call it // again to get the second principle component, etc). void ComputePrincipleComponent(int nDims, double* pOutVector, int nIterations, bool bExtract); // Computes the covariance between two attributes double ComputeCovariance(int nAttr1, double dMean1, int nAttr2, double dMean2); // Computes the covariance matrix of the data void ComputeCovarianceMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation); // Computes the probability of each possible value for one attribute given knowledge of // a specific value for another of the attributes void ComputeCoprobabilityMatrix(GMatrix* pOutMatrix, GArffRelation* pRelation, int nAttr, double noDataValue); // Dump a representation of the data to stdout void Print(int nAttributes); // Computes the best pivot for minimizing the sum of the variance of each half double ComputeMinimumVariancePivot(int nAttr); // Computes the best pivot for minimizing the sum output info bool PickPivotToReduceInfo(double* pOutPivot, double* pOutputInfo, GArffRelation* pRelation, int nAttr);/* // This assumes that the relation has an even number of outputs. Even outputs (0, 2, 4, ...) represent // the real component of a complex output value and odd outputs (1, 3, 5, ...) represent the imaginary // component. GArffData* SlowFourierTransform(GArffRelation* pRel, bool bForward);*/ // Adds nNoiseDims dimensions of random gaussian dimensions to the data. (Also adds corresponding // attributes to pRelation). void AddGaussianNoiseDimensions(GArffRelation* pRelation, int nNoiseDims);#ifndef NO_TEST_CODE static void Test();#endif // !NO_TEST_CODE};class GArffDataRegressCritic : public GRealVectorCritic{protected: GArffData* m_pData; int m_nVariables; int m_nAttrX; int m_nAttrY;public: GArffDataRegressCritic(GArffData* pData, int nVariables, int nAttrX, int nAttrY) : GRealVectorCritic(nVariables) { m_pData = pData; m_nAttrX = nAttrX; m_nAttrY = nAttrY; } virtual ~GArffDataRegressCritic() {}protected: virtual double ApplyVariables(double dX, double* pVariables) = 0; virtual double ComputeError(double* pVector) { int nCount = m_pData->GetSize(); int i; double* pVec; double y; double dError = 0; for(i = 0; i < nCount; i++) { pVec = m_pData->GetVector(i); y = ApplyVariables(pVec[m_nAttrX], pVector); y -= pVec[m_nAttrY]; dError += (y * y); } return dError; }};#endif // __GARFF_H__
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?