⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dataset.java

📁 一个决策树的Applet(转载
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package ai.decision.algorithm;

import java.io.*;
import java.util.*;
import java.net.*;

/**
 * The Dataset class encapsulates a dataset used to build a decision tree.
 * A Dataset object can be queried for attribute information (used during the
 * splitting process).
 *
 * <p>
 * The class manages and provides access to:
 *
 * <p>
 * <ul>
 *     <li>Raw data tuples (for training and testing).
 *     <li>Target attribute information:
 *     <ul>
 *         <li>The name of the target attribute for this dataset.
 *         <li>Possible values for the target attribute.
 *     </ul>
 *     <li>General attribute information:
 *     <ul>
 *         <li>The name of each attribute.
 *         <li>Possible values for each attribute.
 *     </ul>
 * </ul>
 *
 * <p>
 * Data tuples are stored internally as integer arrays.  Each
 * array cell contains an integer that corresponds to a particular
 * attribute value index (as defined by the order of attributes
 * and attribute values in the metadata file).  This is a
 * reasonably efficient way to store large datasets.
 *
 * <p>
 * Once a dataset has been loaded, it is immutable - no new
 * examples can be stored.  To extend or change a dataset, a
 * new Dataset object must be created.
 *
 * <p>
 * <b>Change History:</b>
 *
 * <p><pre>
 * Name:            Date:            Change:
 * =============================================================
 * J. Kelly         May-03-2000      Created.
 * J. Kelly         May-12-2000      Moved attribute classes
 *                                   (now external).
 * J. Kelly         Sep-27-2000      Added support for
 *                                   testing set.
 * J. Kelly         Feb-06-2001      Added ability to create
 *                                   random testing sets.
 * </pre>
 *
 * Copyright 2000 University of Alberta.
 *
 * <!--
 * This file is part of the Decision Tree Applet.
 *
 * The Decision Tree Applet is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Decision Tree Applet is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with the Decision Tree Applet; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * -->
 */
public class Dataset
{
  // Debug data members

  // Mutable, package-private instance flag (not a constant) that is
  // initialised to true.  Its readers are not visible in this part of
  // the file - presumably it gates diagnostic output; confirm before
  // changing the default.
  boolean DEBUG_ON = true;   // Turn on/off debugging info.

  // Instance data members

  // Attribute objects for this dataset.  The target attribute is kept
  // at index 0 (see getTargetAttribute()).
  Vector m_attributes;       // Attributes for data set.
  // Training examples; created empty by the constructors.
  // NOTE(review): the class comment says tuples are stored as integer
  // arrays - confirm the element type against parseDataFile().
  Vector m_trainingSet;      // Training data storage.
  // Testing examples; created empty by the constructors.
  Vector m_testingSet;       // Testing data storage.
  // Allocated by the constructors with one slot per possible value of
  // the target attribute.
  int[]  m_targetSums;       // Number of examples in each
                             // target attribute class.

  // Constructors

  /**
   * Builds a new Dataset.  Creates a FileParser
   * to parse the metadata and example files for
   * the dataset.
   *
   * <p>
   * This constructor retrieves the data from the specified
   * URL.
   *
   * @param repository The location of the metadata and
   *        example files.
   *
   * @param metaFile The name of the metadata file -
   *        the constructor attempts to open the effective
   *        URL: repository + metaFile for reading.
   *
   * @throws MalformedURLException if the URL protocol is
   *         unrecognized.
   *
   * @throws InvalidMetaFileException If the
   *         metadata file contains syntax errors.
   *
   * @throws InvalidDataFileException If the data
   *         (example) file contains syntax errors.
   *
   * @throws IOException If the configuration or data file
   *         cannot be read.
   */
  public Dataset( URL repository, String metaFile )
    throws MalformedURLException,
           InvalidMetaFileException,
           InvalidDataFileException,
           IOException
  {
    m_attributes  = new Vector();
    m_trainingSet = new Vector();
    m_testingSet  = new Vector();

    // Build the URL for the meta file.  This is a plain string
    // concatenation of the repository location and the file name
    // (no relative-URL resolution is performed).
    URL metaFileURL =
      new URL( repository + metaFile );

    // First, we parse the metadata file; the value it returns is used
    // below as the location of the data (example) file.  The stream is
    // always closed once parsing finishes, whether it succeeded or
    // threw, so the underlying connection is not leaked.
    String dataFile;
    InputStream metaInputStream = metaFileURL.openStream();

    try {
      dataFile = parseMetaFile( metaInputStream );
    }
    finally {
      metaInputStream.close();
    }

    // The data file location specified in the meta file
    // is relative to the meta file location.
    String fileRootPath = "";
    int pos = metaFile.lastIndexOf( '/' );

    if( pos != -1 )
      fileRootPath = metaFile.substring( 0, pos + 1 );

    // Build the URL for the data file.
    URL dataFileURL =
      new URL( repository + fileRootPath + dataFile );

    InputStream dataInputStream = dataFileURL.openStream();

    try {
      // Initialize the target counts array (one slot per possible
      // value of the target attribute) before any example is parsed.
      m_targetSums = new int[ getTargetAttribute().getNumValues() ];

      // Now, we parse the data file and store the data in memory.
      parseDataFile( dataInputStream );
    }
    finally {
      // Release the stream even if sizing the counts array or
      // parsing the examples fails.
      dataInputStream.close();
    }
  }

  /**
   * Builds a new Dataset.  Creates a FileParser to parse the metadata
   * and example files for the dataset.
   *
   * @param metaFile A file containing the metadata
   *        for this dataset and a pointer to the actual
   *        example file.
   *
   * @throws InvalidMetaFileException If the
   *         metadata file contains syntax errors.
   *
   * @throws InvalidDataFileException If the data
   *         (example) file contains syntax errors.
   *
   * @throws IOException If the configuration or data file
   *         cannot be read.
   */
  public Dataset( String metaFile )
    throws InvalidMetaFileException,
           InvalidDataFileException,
           IOException
  {
    m_attributes  = new Vector();
    m_trainingSet = new Vector();
    m_testingSet  = new Vector();

    // First, we parse the configuration file; the value it returns is
    // used below as the location of the data (example) file.  The
    // stream is always closed once parsing finishes, whether it
    // succeeded or threw, so the file handle is not leaked.
    String dataFile;
    FileInputStream metaInputStream =
      new FileInputStream( metaFile );

    try {
      dataFile = parseMetaFile( metaInputStream );
    }
    finally {
      metaInputStream.close();
    }

    // The data file location specified in the meta file
    // is relative to the meta file location.
    String fileRootPath = "";
    int pos = metaFile.lastIndexOf( '/' );

    if( pos != -1 )
      fileRootPath = metaFile.substring( 0, pos + 1 );

    // Now, we parse the data file and store the data in memory.
    FileInputStream dataInputStream =
      new FileInputStream( fileRootPath + dataFile );

    try {
      // Initialize the target counts array (one slot per possible
      // value of the target attribute) before any example is parsed.
      m_targetSums = new int[ getTargetAttribute().getNumValues() ];

      parseDataFile( dataInputStream );
    }
    finally {
      // Release the file handle even if sizing the counts array or
      // parsing the examples fails.
      dataInputStream.close();
    }
  }

  // Public methods

  /**
   * Returns the target attribute for this data set.
   *
   * @return An Attribute object for the target attribute
   *         in this dataset.
   */
  public Attribute getTargetAttribute()
  {
    // Slot 0 of the attributes vector is reserved for the target
    // attribute, so it can be fetched without searching.
    final Object target = m_attributes.elementAt( 0 );

    return (Attribute)target;
  }

  /**
   * Finds and returns a particular attribute.
   *
   * @param attName The name of the attribute to locate.
   *
   * @return An Attribute object with the specified
   *         name.
   *
   * @throws NonexistentAttributeException If
   *         the attribute does not exist in the dataset.
   */
  public Attribute getAttributeByName( String attName )
    throws NonexistentAttributeException
  {
    // Walk the attributes in storage order; the first attribute
    // whose name equals the requested one is returned.
    int index = 0;

    while( index < m_attributes.size() ) {
      Attribute candidate = (Attribute)m_attributes.elementAt( index );
      String candidateName = candidate.getName();

      if( candidateName.equals( attName ) )
        return candidate;

      index++;
    }

    // Every attribute was examined without a match.
    String message = "Attribute " + attName + " does not exist.";

    throw new NonexistentAttributeException( message );
  }

  /**
   * Finds and returns a particular attribute, using the attribute's
   * location in the internal attributes Vector.
   *
   * <p>
   * This method is primarily available to allow for iteration over all
   * attributes in the dataset.
   *
   * @param attNum The zero-based index of the attribute in the
   *        internal attributes vector.
   *
   * @return An Attribute object stored at the
   *         specified index in the attributes vector.
   *
   * @throws NonexistentAttributeException If
   *         the attribute does not exist in the dataset
   *         (i.e. the index is out of range).
   *
   */
  public Attribute getAttributeByNum( int attNum )
    throws NonexistentAttributeException
  {
    // A valid index lies in the half-open range [0, size).
    boolean inRange = attNum >= 0 && attNum < m_attributes.size();

    if( inRange )
      return (Attribute)m_attributes.elementAt( attNum );

    // Out-of-range indices are reported with the offending value.
    String message =
      "Attribute at location " + attNum + " does not exist.";

    throw new NonexistentAttributeException( message );
  }

  /**
   * Finds and returns the position of a particular attribute in the
   * Dataset's internal storage list.
   *
   * @param attName The name of the attribute to locate.
   *
   * @return The position of the attribute in
   *         the Dataset's internal storage list.
   *
   * @throws NonexistentAttributeException If an
   *         attribute with the supplied name
   *         does not exist.
   */
  public int getAttributePosition( String attName )
    throws NonexistentAttributeException
  {
    // Scan the attributes front to back and report the index of the
    // first one whose name equals the requested name.
    for( int slot = 0; slot < m_attributes.size(); slot++ ) {
      Attribute current = (Attribute)m_attributes.elementAt( slot );

      if( !current.getName().equals( attName ) )
        continue;

      return slot;
    }

    // No attribute carries the requested name.
    String message = "Attribute named " + attName + " does not exist.";

    throw new NonexistentAttributeException( message );
  }

  /**
   * Returns the number of attributes (including the
   * target attribute) in this dataset.
   *
   * @return The total number of attributes in the
   *         dataset, including the target attribute.
   */
  public int getNumAttributes()
  {
    // The target attribute lives in the same vector as the others,
    // so it is included in this count.
    final int attributeCount = m_attributes.size();

    return attributeCount;
  }

  /**
   * Returns the number of training examples <i>currently</i> in
   * this dataset.
   *
   * @return The total number of training examples in the dataset.
   */
  public int getNumTrainingExamples()
  {
    // Size of the training vector at the moment of the call.
    final int trainingCount = m_trainingSet.size();

    return trainingCount;
  }

  /**
   * Returns the number of testing examples currently
   * in this dataset.
   *
   * @return The total number of testing examples in
   *         the dataset.
   */
  public int getNumTestingExamples()
  {
    // Size of the testing vector at the moment of the call.
    final int testingCount = m_testingSet.size();

    return testingCount;
  }

  /**
   * Creates and returns a Vector that contains the names of
   * all attributes in this data set (including the target
   * attribute).
   *
   * @return A Vector containing Strings which are the
   *         names of all the attributes (in the order
   *         that they were parsed from the metadata file -
   *         target attribute first).
   */
  public Vector getAttributeNames()
  {
    // A fresh vector is built on every call; names are appended in
    // storage order, so the target attribute's name comes first.
    Vector result = new Vector();
    int index = 0;

    while( index < m_attributes.size() ) {
      Attribute current = (Attribute)m_attributes.elementAt( index );

      result.addElement( current.getName() );
      index++;
    }

    return result;
  }

  /**
   * Returns an iterator over the training examples in the current dataset.
   *
   * @return An iterator over all the training examples.
   */
  public Iterator getTrainingExamples()
  {
    // The iterator is obtained directly from the internal training
    // vector (no copy is made).
    // NOTE(review): callers could presumably call remove() on it and
    // alter the stored training set - confirm whether that is intended.
    Iterator examples = m_trainingSet.iterator();

    return examples;
  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -