// File: dataset.java
// (Code-viewer residue: "Font size:" control label; not part of the source.)
package ai.decision.algorithm;
import java.io.*;
import java.util.*;
import java.net.*;
/**
* The Dataset class encapsulates a dataset used to build a decision tree.
* A Dataset object can be queried for attribute information (used during the
* splitting process).
*
* <p>
* The class manages and provides access to:
*
* <p>
* <ul>
* <li>Raw data tuples (for training and testing).
* <li>Target attribute information:
* <ul>
* <li>The name of the target attribute for this dataset.
* <li>Possible values for the target attribute.
* </ul>
* <li>General attribute information:
* <ul>
* <li>The name of each attribute.
* <li>Possible values for each attribute.
* </ul>
* </ul>
*
* <p>
* Data tuples are stored internally as integer arrays. Each
* array cell contains an integer that corresponds to a particular
* attribute value index (as defined by the order of attributes
* and attribute values in the metadata file). This is a
* reasonably efficient way to store large datasets.
*
* <p>
* Once a dataset has been loaded, it is immutable - no new
* examples can be stored. To extend or change a dataset, a
* new Dataset object must be created.
*
* <p>
* <b>Change History:</b>
*
* <p><pre>
* Name: Date: Change:
* =============================================================
* J. Kelly May-03-2000 Created.
* J. Kelly May-12-2000 Moved attribute classes
* (now external).
* J. Kelly Sep-27-2000 Added support for
* testing set.
* J. Kelly Feb-06-2001 Added ability to create
* random testing sets.
* </pre>
*
* Copyright 2000 University of Alberta.
*
* <!--
* This file is part of the Decision Tree Applet.
*
* The Decision Tree Applet is free software; you can redistribute it
* and/or modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* The Decision Tree Applet is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with the Decision Tree Applet; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
* -->
*/
public class Dataset
{
// Debug data members
boolean DEBUG_ON = true; // Turn on/off debugging info (initialized to on).
// Instance data members
Vector m_attributes; // Attributes for data set; the target attribute is kept at position 0.
Vector m_trainingSet; // Training data storage.
Vector m_testingSet; // Testing data storage.
int[] m_targetSums; // Number of examples in each
// target attribute class (allocated with one slot per target attribute value).
// Constructors
/**
* Builds a new Dataset. Creates a FileParser
* to parse the metadata and example files for
* the dataset.
*
* <p>
* This constructor retrieves the data from the specified
* URL.
*
* @param repository The location of the metadata and
* example files.
*
* @param metaFile The name of the metadata file -
* the constructor attempts to open the effective
* URL: repository + metaFile for reading.
*
* @throws MalformedURLException if the URL protocol is
* unrecognized.
*
* @throws InvalidMetaFileException If the
* metadata file contains syntax errors.
*
* @throws InvalidDataFileException If the data
* (example) file contains syntax errors.
*
* @throws IOException If the configuration or data file
* cannot be read.
*/
public Dataset( URL repository, String metaFile )
  throws MalformedURLException,
         InvalidMetaFileException,
         InvalidDataFileException,
         IOException
{
  m_attributes  = new Vector();
  m_trainingSet = new Vector();
  m_testingSet  = new Vector();

  // Build the URL for the meta file (repository location + file name).
  URL metaFileURL = new URL( repository + metaFile );

  // First, parse the metadata file and grab the various attribute
  // names etc.  The stream is always released, even when parsing
  // fails, so a bad metadata file cannot leak a connection.
  String dataFile;
  InputStream metaInputStream = metaFileURL.openStream();
  try {
    dataFile = parseMetaFile( metaInputStream );
  }
  finally {
    metaInputStream.close();
  }

  // The data file location specified in the meta file
  // is relative to the meta file location, so keep everything
  // up to (and including) the last '/' of the meta file name.
  String fileRootPath = "";
  int pos = metaFile.lastIndexOf( '/' );
  if( pos != -1 )
    fileRootPath = metaFile.substring( 0, pos + 1 );

  // Build the URL for the data file.
  URL dataFileURL = new URL( repository + fileRootPath + dataFile );

  InputStream dataInputStream = dataFileURL.openStream();
  try {
    // Initialize the target counts array - one counter per
    // possible value of the target attribute.
    m_targetSums = new int[ getTargetAttribute().getNumValues() ];

    // Now, parse the data file and store the data in memory.
    parseDataFile( dataInputStream );
  }
  finally {
    // Release the data stream whether or not parsing succeeded.
    dataInputStream.close();
  }
}
/**
* Builds a new Dataset. Creates a FileParser to parse the metadata
* and example files for the dataset.
*
* @param metaFile A file containing the metadata
* for this dataset and a pointer to the actual
* example file.
*
* @throws InvalidMetaFileException If the
* metadata file contains syntax errors.
*
* @throws InvalidDataFileException If the data
* (example) file contains syntax errors.
*
* @throws IOException If the configuration or data file
* cannot be read.
*/
public Dataset( String metaFile )
  throws InvalidMetaFileException,
         InvalidDataFileException,
         IOException
{
  m_attributes  = new Vector();
  m_trainingSet = new Vector();
  m_testingSet  = new Vector();

  // First, parse the metadata file and grab the various attribute
  // names etc.  The stream is always released, even when parsing
  // fails, so a bad metadata file cannot leak a file handle.
  String dataFile;
  FileInputStream metaInputStream = new FileInputStream( metaFile );
  try {
    dataFile = parseMetaFile( metaInputStream );
  }
  finally {
    metaInputStream.close();
  }

  // The data file location specified in the meta file
  // is relative to the meta file location, so keep everything
  // up to (and including) the last '/' of the meta file path.
  String fileRootPath = "";
  int pos = metaFile.lastIndexOf( '/' );
  if( pos != -1 )
    fileRootPath = metaFile.substring( 0, pos + 1 );

  // Now, parse the data file and store the data in memory.
  FileInputStream dataInputStream =
    new FileInputStream( fileRootPath + dataFile );
  try {
    // Initialize the target counts array - one counter per
    // possible value of the target attribute.
    m_targetSums = new int[ getTargetAttribute().getNumValues() ];

    parseDataFile( dataInputStream );
  }
  finally {
    // Release the data file whether or not parsing succeeded.
    dataInputStream.close();
  }
}
// Public methods
/**
* Returns the target attribute for this data set.
*
* @return An Attribute object for the target attribute
* in this dataset.
*/
public Attribute getTargetAttribute()
{
  // By convention the first slot of the attributes vector
  // holds the target attribute.
  Object target = m_attributes.elementAt( 0 );
  return (Attribute)target;
}
/**
* Finds and returns the attribute with a particular name.
*
* @param attName The name of the attribute to locate.
*
* @return An Attribute object with the specified
* name.
*
* @throws NonexistentAttributeException If
* no attribute with that name exists in the dataset.
*/
public Attribute getAttributeByName( String attName )
  throws NonexistentAttributeException
{
  // Walk the attributes front to back (simple linear scan);
  // the first attribute whose name matches is returned.
  Enumeration candidates = m_attributes.elements();
  while( candidates.hasMoreElements() ) {
    Attribute candidate = (Attribute)candidates.nextElement();
    if( candidate.getName().equals( attName ) )
      return candidate;
  }

  // No attribute carries the requested name.
  throw new NonexistentAttributeException(
    "Attribute " + attName + " does not exist." );
}
/**
* Finds and returns a particular attribute, using the attribute's
* location in the internal attributes Vector.
*
* <p>
* This method is primarily available to allow for iteration over all
* attributes in the dataset.
*
* @param attNum The index of the attribute in the
* internal attributes vector.
*
* @return An Attribute object stored at the
* specified index in the attributes vector.
*
* @throws NonexistentAttributeException If
* the attribute does not exist in the dataset
* (i.e. the index is out of range).
*/
public Attribute getAttributeByNum( int attNum )
  throws NonexistentAttributeException
{
  // A valid index lies in [0, number of attributes).
  boolean inRange = attNum >= 0 && attNum < m_attributes.size();

  if( !inRange ) {
    throw new NonexistentAttributeException(
      "Attribute" + " at location " + attNum + " does not exist." );
  }

  return (Attribute)m_attributes.elementAt( attNum );
}
/**
* Finds and returns the position of a particular attribute in the
* Dataset's internal storage list.
*
* @param attName The name of the attribute to locate.
*
* @return The position of the attribute in
* the Dataset's internal storage list.
*
* @throws NonexistentAttributeException If an
* attribute with the supplied name
* does not exist.
*/
public int getAttributePosition( String attName )
  throws NonexistentAttributeException
{
  // Simple linear scan: report the index of the first attribute
  // whose name equals the requested one.
  int position = 0;
  while( position < m_attributes.size() ) {
    Attribute current = (Attribute)m_attributes.elementAt( position );
    if( current.getName().equals( attName ) )
      return position;
    position++;
  }

  // Ran off the end without finding a match.
  throw new NonexistentAttributeException(
    "Attribute" + " named " + attName + " does not exist." );
}
/**
* Returns the number of attributes (including the
* target attribute) in this dataset.
*
* @return The total number of attributes in the
* dataset, including the target attribute.
*/
public int getNumAttributes()
{
  // Every entry of the attributes vector counts, the target
  // attribute included.
  final int attributeCount = m_attributes.size();
  return attributeCount;
}
/**
* Returns the number of training examples <i>currently</i> in
* this dataset.
*
* @return The total number of training examples in the dataset.
*/
public int getNumTrainingExamples()
{
  // One vector entry per stored training example.
  final int trainingCount = m_trainingSet.size();
  return trainingCount;
}
/**
* Returns the number of testing examples currently
* in this dataset.
*
* @return The total number of testing examples in
* the dataset.
*/
public int getNumTestingExamples()
{
  // One vector entry per stored testing example.
  final int testingCount = m_testingSet.size();
  return testingCount;
}
/**
* Creates and returns a Vector that contains the names of
* all attributes in this data set (including the target
* attribute).
*
* @return A Vector containing Strings which are the
* names of all the attributes (in the order
* that they were parsed from the metadata file -
* target attribute first).
*/
public Vector getAttributeNames()
{
  // Collect the names into a fresh vector, preserving the order
  // in which the attributes are stored.
  Vector attributeNames = new Vector();

  Enumeration atts = m_attributes.elements();
  while( atts.hasMoreElements() ) {
    Attribute att = (Attribute)atts.nextElement();
    attributeNames.addElement( att.getName() );
  }

  return attributeNames;
}
/**
* Returns an iterator over the training examples in the current dataset.
*
* @return An iterator over all the training examples.
*/
public Iterator getTrainingExamples()
{
  // Hand out an iterator over a read-only view of the training set.
  // Iterating works exactly as before, but Iterator.remove() now
  // throws UnsupportedOperationException instead of silently deleting
  // examples from the dataset's internal storage.
  return Collections.unmodifiableList( m_trainingSet ).iterator();
}
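// --- Code-viewer residue below; not part of the Java source. ---
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -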