// dataset.java
// (File-name banner and "Font size:" label left behind by the web code viewer.)
/**
 * Provides sequential access to the examples currently held in the
 * testing set.
 *
 * @return An iterator obtained from the testing set.
 */
public Iterator getTestingExamples()
{
    Iterator testingExamples = m_testingSet.iterator();
    return testingExamples;
}
/**
 * Transfers one example from the training set to the testing set.
 * An example number outside the range 0 .. (training set size - 1)
 * leaves both sets untouched.
 *
 * @param exampleNum Zero-based position, within the training set,
 *        of the example to transfer.
 */
public void moveToTestingSet( int exampleNum )
{
    // Only act when the position refers to an existing training example.
    if( exampleNum >= 0 && exampleNum < m_trainingSet.size() ) {
        m_testingSet.add( m_trainingSet.remove( exampleNum ) );
    }
}
/**
 * Transfers one example from the testing set to the training set.
 * An example number outside the range 0 .. (testing set size - 1)
 * leaves both sets untouched.
 *
 * @param exampleNum Zero-based position, within the testing set,
 *        of the example to transfer.
 */
public void moveToTrainingSet( int exampleNum )
{
    // Only act when the position refers to an existing testing example.
    if( exampleNum >= 0 && exampleNum < m_testingSet.size() ) {
        m_trainingSet.add( m_testingSet.remove( exampleNum ) );
    }
}
/**
 * Looks up a single training example by its position in the
 * training set.
 *
 * @param exampleNum Zero-based position of the requested example
 *        within the training set.
 *
 * @return The element stored at that position, cast to an
 *         integer array.
 *
 * @throws IndexOutOfBoundsException If the example number is
 *         negative, or is not smaller than the number of
 *         training examples in the dataset.
 */
public int[] getTrainingExample( int exampleNum )
{
    boolean exists = exampleNum >= 0 && exampleNum < m_trainingSet.size();

    if( !exists ) {
        String message = "Example number " + exampleNum + " does not exist.";
        throw new IndexOutOfBoundsException( message );
    }

    return (int[])m_trainingSet.elementAt( exampleNum );
}
/**
 * Looks up a single testing example by its position in the
 * testing set.
 *
 * @param exampleNum Zero-based position of the requested example
 *        within the testing set.
 *
 * @return The element stored at that position, cast to an
 *         integer array.
 *
 * @throws IndexOutOfBoundsException If the example number is
 *         negative, or is not smaller than the number of
 *         testing examples in the dataset.
 */
public int[] getTestingExample( int exampleNum )
{
    boolean exists = exampleNum >= 0 && exampleNum < m_testingSet.size();

    if( !exists ) {
        String message = "Example number " + exampleNum + " does not exist.";
        throw new IndexOutOfBoundsException( message );
    }

    return (int[])m_testingSet.elementAt( exampleNum );
}
/**
 * Creates a random testing dataset. Any examples in the current
 * testing set are first returned to the training set, so calling
 * this method destroys a previously built testing set.
 *
 * For a balanced split the number of examples drawn for each target
 * value is rounded independently, so the resulting testing set size
 * can differ slightly from the requested percentage. The number drawn
 * for a target value never exceeds the number of training examples
 * that actually carry that value.
 *
 * @param percentage Percentage (0 to 100) of the entire dataset to
 *        use for testing.
 *
 * @param balanced <code>true</code> to create a balanced
 *        testing set, where the testing set and the
 *        remaining training set have the same proportion
 *        of each class.
 *
 * @throws IllegalArgumentException If the percentage value
 *         is less than 0 or greater than 100.
 */
public void createRndTestSet( int percentage, boolean balanced )
{
    if( percentage < 0 || percentage > 100 )
        throw new
            IllegalArgumentException( "Percentage value out of range." );

    // Move any examples that are part of the current testing
    // set back to the training set.
    m_trainingSet.addAll( m_testingSet );
    m_testingSet.clear();

    // Calculate the number of examples that should be
    // in the testing set.
    int totalNumExamples = m_trainingSet.size();
    int numTestingExamples =
        Math.round( totalNumExamples * ((float)percentage)/100.0f );

    Random rand = new Random();

    // If the set doesn't have to be balanced, then just
    // pick examples at random.
    if( !balanced ) {
        for( int i = 0; i < numTestingExamples; i++ ) {
            m_testingSet.add(
                m_trainingSet.remove( rand.nextInt( m_trainingSet.size() ) ) );
        }
        return;
    }

    // Balanced split: use the stored target value distribution to
    // decide how many examples of each target value to move.
    for( int targetValue = 0; targetValue < m_targetSums.length; targetValue++ ) {
        int numExamplesToMove =
            Math.round( m_targetSums[targetValue] / ((float)totalNumExamples)
                        * numTestingExamples );

        // Count the training examples that really have this target
        // value (element 0 of each example is compared against it).
        int numAvailable = 0;
        for( int k = 0; k < m_trainingSet.size(); k++ ) {
            int[] candidate = (int[])m_trainingSet.get( k );
            if( candidate[0] == targetValue ) numAvailable++;
        }

        // Never try to move more examples than exist; this keeps the
        // selection loop below bounded even if the stored counts and
        // the training set disagree.
        if( numExamplesToMove > numAvailable )
            numExamplesToMove = numAvailable;

        for( int j = 0; j < numExamplesToMove; j++ ) {
            // Choose uniformly among the remaining examples with this
            // target value: skip a random number of matches, then take
            // the next one. The scan always ends inside the vector
            // because the number skipped is less than numAvailable.
            int matchesToSkip = rand.nextInt( numAvailable );
            int pickedIndex = 0;

            while( true ) {
                int[] candidate = (int[])m_trainingSet.get( pickedIndex );

                if( candidate[0] == targetValue ) {
                    if( matchesToSkip == 0 ) break;
                    matchesToSkip--;
                }

                pickedIndex++;
            }

            // Remove by position, so no second search is needed.
            m_testingSet.add( m_trainingSet.remove( pickedIndex ) );
            numAvailable--;
        }
    }
}
// Private methods
/**
 * Parses a decision tree configuration (metadata) file.
 *
 * The target attribute is read first and appended to the attributes
 * vector, followed by every general attribute in the order the parser
 * returns them. The debug dump at the end assumes the target attribute
 * ends up at position 0 of that vector.
 *
 * @param metaInputStream An input stream attached
 *        to the metadata file.
 *
 * @return A String containing the name of the ID3
 *         data file to open and read.
 *
 * @throws InvalidMetaFileException If a
 *         syntax error is encountered during the
 *         parsing process.
 *
 * @throws IOException If a problem occurs while
 *         reading the configuration file.
 */
private String parseMetaFile( InputStream metaInputStream )
    throws InvalidMetaFileException,
           IOException
{
    String buf;  // most recent token returned by the parser

    FileParser parser = new FileParser();

    // 1. Open the metadata file and read in (ignoring comments).
    parser.startMetaParse( metaInputStream );

    // 2. Extract the target attribute name.
    // NOTE(review): the new String(...) copies here and below are kept
    // as-is; presumably they detach the stored values from the parser's
    // buffer - confirm before removing them.
    parser.moveToTargetAttribute();
    String targetName = new String( parser.extractAttributeName() );

    Vector targetValues = new Vector();
    int numTargetValues;

    // 3. Loop, extracting the possible values for the target attribute.
    while( (buf = parser.extractAttributeValue()) != null )
        targetValues.add( new String( buf ) );

    // Build the target attribute and add it to the attributes vector.
    Attribute targetAttribute = new Attribute( targetName, targetValues, 1 );

    numTargetValues = targetValues.size();
    m_attributes.add( targetAttribute );

    // 4. Get ready to extract general attribute information.
    parser.moveToAttributes();

    // 5. Loop, extracting each attribute one after the other.
    while( (buf = parser.extractAttributeName()) != null ) {
        String attName = new String( buf );
        Vector attValues = new Vector();

        // 5.a. Loop, extracting the possible values for this
        //      attribute.
        while( (buf = parser.extractAttributeValue()) != null )
            attValues.add( new String( buf ) );

        Attribute nextAtt = new Attribute( attName, attValues, numTargetValues );
        m_attributes.add( nextAtt );
    }

    // 6. Extract the name of the data file.
    parser.moveToDataFilePath();
    String dataFilePath = parser.extractString();

    //-------------------------  DEBUG -------------------------

    if( DEBUG_ON )
    {
        System.out.println( "Dataset::parseMetaFile: " +
            "Finished parsing configuration file." );
        System.out.println();

        // Target attribute information.
        System.out.println( "Target Attribute" );
        System.out.println( "------------------------------" );
        System.out.println();
        System.out.println( "Target Attribute Name: " +
            ((Attribute)m_attributes.elementAt(0)).getName() );
        System.out.println( "Number of Values: " +
            ((Attribute)m_attributes.elementAt(0)).getNumValues() );
        System.out.println( "Possible Values: " +
            ((Attribute)m_attributes.elementAt(0)).getValueNames() );
        System.out.println();

        // General attribute information.
        System.out.println( "Attributes" );
        System.out.println( "------------------------------" );
        System.out.println();
        System.out.println( "Number of Attributes: " +
            (getNumAttributes() - 1) );
        System.out.println();

        for( int i = 1; i < getNumAttributes(); i++ ) {
            Attribute currAtt;

            try {
                currAtt = getAttributeByNum( i );
            }
            catch( NonexistentAttributeException e ) {
                // Not expected, because i stays below getNumAttributes().
                // Fail with a clear message and the original cause rather
                // than dereferencing a null attribute below.
                throw new IllegalStateException(
                    "Attribute " + i + " could not be retrieved.", e );
            }

            System.out.println( "Attribute " + i + " Name: " +
                currAtt.getName() );
            System.out.println( "Number of Values: " +
                currAtt.getNumValues() );
            System.out.println( "Possible Values: " +
                currAtt.getValueNames() );
            System.out.println();
        }

        // Data file information.
        System.out.println( "Training/Testing Data" );
        System.out.println( "------------------------------" );
        System.out.println();
        System.out.println( "Data will be extracted from " + dataFilePath );
        System.out.println();
    }

    return dataFilePath;
}
/**
 * Parses a decision tree data file.
 *
 * Each sample read from the file is checked against the attributes
 * vector, converted to an integer array holding the position of each
 * value within its attribute, and appended to the training set. The
 * count for a sample's target value (element 0) is incremented only
 * once the whole sample has been validated, so a rejected line does
 * not change the target value counts.
 *
 * @param dataInputStream An input stream attached
 *        to the data file.
 *
 * @throws InvalidDataFileException If a
 *         syntax error is encountered during the
 *         parsing process.
 *
 * @throws IOException If a problem occurs while
 *         reading the data file.
 */
private void parseDataFile( InputStream dataInputStream )
    throws InvalidDataFileException,
           IOException
{
    Vector rawSample;  // Vector of strings extracted
                       // directly from the data file.

    FileParser parser = new FileParser();

    // 1. Open the data file.
    parser.startDataParse( dataInputStream );

    // 2. Start extracting samples.
    while( (rawSample = parser.extractDataSample()) != null ) {
        // We have the raw samples, so now we verify
        // that all the values are legal and store the
        // sample in compact form (using the integer index of
        // each value).
        if( rawSample.size() != m_attributes.size() )
            throw new
                InvalidDataFileException( "Syntax error in data file (line " +
                    parser.getCurrentLineNum() + "): Wrong " +
                    "number of attributes on line." );

        int[] dataSample = new int[ m_attributes.size() ];

        try {
            // 2.a. Deal with all the attributes.
            for( int i = 0; i < rawSample.size(); i++ ) {
                // There should be a 1-to-1 ordering between
                // the internal attributes vector and the
                // raw sample vector.
                Attribute currAtt = (Attribute)m_attributes.elementAt( i );

                dataSample[i] = currAtt.getAttributeValuePosition(
                    (String)rawSample.elementAt( i ) );
            }
        }
        catch( NonexistentAttributeValueException e ) {
            // One of the attribute values on this line of
            // the data file doesn't correspond with the values
            // specified in the config file.
            throw new
                InvalidDataFileException( "Syntax error in data " +
                    "file (line " + parser.getCurrentLineNum() +
                    "): Attribute value does not match any values in " +
                    "configuration file." );
        }

        // 2.b. The whole sample is valid, so count its target value now.
        //      Counting earlier would leave the counts out of step with
        //      the training set whenever a later value on the line is
        //      rejected.
        if( dataSample.length > 0 )
            m_targetSums[ dataSample[0] ]++;

        // Add the last data sample to our current training set.
        m_trainingSet.add( dataSample );
    }

    //-------------------------  DEBUG -------------------------

    if( DEBUG_ON ) {
        System.out.println( "Dataset::parseDataFile: " +
            "Finished parsing data file." );
        System.out.println();

        // Sample and target count information.
        System.out.println( "Data Samples" );
        System.out.println( "------------------------------" );
        System.out.println();
        System.out.println( "Parsed and stored " + m_trainingSet.size() +
            " data samples." );
        System.out.println();
        System.out.print( "Target classification counts are: " );

        for( int i = 0; i < m_targetSums.length; i++ )
            System.out.print( m_targetSums[i] + " " );

        System.out.println();
        System.out.println();
    }
}
}
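// ---------------------------------------------------------------
// Web code-viewer residue (keyboard shortcut help); not part of
// the Java source.
//   Copy code:           Ctrl + C
//   Search code:         Ctrl + F
//   Fullscreen mode:     F11
//   Toggle theme:        Ctrl + Shift + D
//   Show shortcuts:      ?
//   Increase font size:  Ctrl + =
//   Decrease font size:  Ctrl + -
// ---------------------------------------------------------------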