📄 csvloader.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package org.agentacademy.modules.dataminer.core.converters;

/**
 * <p>Title: The Data Miner prototype</p>
 * <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
 * <p>Copyright: Copyright (c) 2002</p>
 * <p>Company: CERTH</p>
 * @author asymeon
 * @version 0.3
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.Enumeration;
import java.util.Hashtable;

import org.agentacademy.modules.dataminer.core.Attribute;
import org.agentacademy.modules.dataminer.core.FastVector;
import org.agentacademy.modules.dataminer.core.Instance;
import org.agentacademy.modules.dataminer.core.Instances;
import org.apache.log4j.Logger;

/**
 * Reads a text file that is comma or tab delimited.
 *
 * @see Loader
 */
public class CSVLoader extends AbstractLoader {

   /** Log4j logger shared by all instances of this loader. */
   public static Logger                log = Logger.getLogger(CSVLoader.class);
  /**
   * Holds the determined structure (header) of the data set.
   * Remains <code>null</code> until a structure has been read and is set
   * back to <code>null</code> by {@link #reset()}.
   */
  //@ protected depends: model_structureDetermined -> m_structure;
  //@ protected represents: model_structureDetermined <- (m_structure != null);
  protected Instances m_structure = null;

  /**
   * Holds the source of the data set.
   * Assigned by <code>setSource(File)</code>; <code>null</code> means that
   * no source has been supplied yet.
   */
  //@ protected depends: model_sourceSupplied -> m_sourceFile;
  //@ protected represents: model_sourceSupplied <- (m_sourceFile != null);
  protected File m_sourceFile = null;

  // Disabled field: the tokenizer used to be kept as instance state.
  // getStructure() and getDataSet() now create a local StreamTokenizer
  // instead, so the declaration below is left commented out.
  //  private StreamTokenizer m_tokenizer = null;

  /**
   * A list of hash tables for accumulating nominal values during parsing.
   * getDataSet() stores one Hashtable per attribute, in attribute order;
   * each table maps a value seen for that attribute to its Integer index.
   */
  private FastVector m_cumulativeStructure;

  /**
   * Holds instances accumulated so far.
   * Each element is itself a FastVector holding the values of one row,
   * as returned by getInstance during getDataSet().
   */
  private FastVector m_cumulativeInstances;

  /**
   * Returns a short, human-readable description of this loader.
   *
   * @return a description of the loader suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    // First sentence: which input formats are accepted.
    final String formatNote =
      "Reads a source that is in comma separated or tab separated format. ";
    // Second sentence: how the header row is interpreted.
    final String headerNote =
      "Assumes that the first row in the file determines the number of "
      + "and names of the attributes.";
    return formatNote + headerNote;
  }

  /**
   * Resets the loader so that it is ready to read a new data set.
   * Only the previously determined structure is discarded; the
   * configured source file is left untouched.
   */
  public void reset() {
    this.m_structure = null;
  }

  /**
   * Resets the Loader object and sets the source of the data set to be
   * the supplied File object. The file is opened once and closed again
   * straight away, purely to verify that it can be read.
   *
   * @param file the source file.
   * @exception IOException if <code>file</code> is null, if the file
   * cannot be opened (the original FileNotFoundException is attached as
   * the cause), or if closing the probe reader fails
   */
  public void setSource(File file) throws IOException {
    reset();

    if (file == null) {
      throw new IOException("Source file object is null!");
    }

    m_sourceFile = file;
    try {
      // Open-and-close probe: nothing is read here, parsing happens later.
      BufferedReader br = new BufferedReader(new FileReader(file));
      br.close();
    } catch (FileNotFoundException ex) {
      // Keep the historical message for callers, but chain the original
      // exception so the offending path and reason are not lost.
      // initCause is used because IOException(String, Throwable) is not
      // available on older Java versions.
      IOException wrapped = new IOException("File not found");
      wrapped.initCause(ex);
      throw wrapped;
    }
  }

  /**
   * Determines and returns (if possible) the structure (internally the
   * header) of the data set as an empty set of instances.
   * The structure is read only when none is cached; the reader opened
   * for that purpose is always closed before this method returns.
   *
   * @return the structure of the data set as an empty set of Instances,
   * or <code>null</code> if the source file could not be opened
   * @exception IOException if no source has been specified, or if
   * reading the header or closing the reader fails
   */
  public Instances getStructure() throws IOException {
    if (m_sourceFile == null) {
      throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
      BufferedReader br = null;
      try {
        br = new BufferedReader(new FileReader(m_sourceFile));

        // assumes that the first line of the file is the header;
        // readStructure is expected to populate m_structure
        StreamTokenizer st = new StreamTokenizer(br);
        initTokenizer(st);
        readStructure(st);
      } catch (FileNotFoundException ex) {
        // Best effort, as before: a missing file still yields a null
        // structure for the caller, but the failure is now recorded
        // instead of being dropped silently.
        log.warn("Source file could not be opened: " + m_sourceFile, ex);
      } finally {
        // The tokenizer is local to this call, so nothing can use the
        // reader afterwards; close it to avoid leaking a file handle.
        if (br != null) {
          br.close();
        }
      }
    }

    return m_structure;
  }

  /**
   * Reads the structure of the data set from the given tokenizer by
   * delegating to <code>readHeader</code>.
   *
   * @param tokenizer the tokenizer to read the header from
   * @exception IOException if <code>readHeader</code> reports an error
   */
  private void readStructure(StreamTokenizer tokenizer) throws IOException {
    readHeader(tokenizer);
  }

  /**
   * Return the full data set. If the structure hasn't yet been determined
   * by a call to getStructure then method should do so before processing
   * the rest of the data set.
   *
   * @return the full data set as a set of Instances
   * @exception IOException if there is no source or parsing fails
   */
  public Instances getDataSet() throws IOException {
    if (m_sourceFile == null) {
      throw new IOException("No source has been specified");
    }
    //    m_sourceReader.close();
    setSource(m_sourceFile);
    BufferedReader br = new BufferedReader(new FileReader(m_sourceFile));
    //    getStructure();
    StreamTokenizer st = new StreamTokenizer(br);
    initTokenizer(st);
    readStructure(st);

    st.ordinaryChar(',');
    st.ordinaryChar('\t');

    m_cumulativeStructure = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      m_cumulativeStructure.addElement(new Hashtable());
    }


    // Instances result = new Instances(m_structure);
    m_cumulativeInstances = new FastVector();
    FastVector current;
    while ((current = getInstance(st)) != null) {
      m_cumulativeInstances.addElement(current);
    }
    br.close();
    // now determine the true structure of the data set
    FastVector atts = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
      String attname = m_structure.attribute(i).name();
      Hashtable tempHash = ((Hashtable)m_cumulativeStructure.elementAt(i));
      if (tempHash.size() == 0) {
	atts.addElement(new Attribute(attname));
      } else {
	FastVector values = new FastVector(tempHash.size());
	// add dummy objects in order to make the FastVector's size == capacity
	for (int z = 0; z < tempHash.size(); z++) {
	  values.addElement("dummy");
	}
	Enumeration e = tempHash.keys();
	while (e.hasMoreElements()) {
	  Object ob = e.nextElement();
	  //	  if (ob instanceof Double) {
	  int index = ((Integer)tempHash.get(ob)).intValue();
	  values.setElementAt(new String(ob.toString()), index);
	  //	  }
	}
	atts.addElement(new Attribute(attname, values));
      }
    }

    // make the instances
    Instances dataSet = new Instances(m_sourceFile.getName(),
				      atts,
				      m_cumulativeInstances.size());

    for (int i = 0; i < m_cumulativeInstances.size(); i++) {
      current = ((FastVector)m_cumulativeInstances.elementAt(i));
      double [] vals = new double[dataSet.numAttributes()];
      for (int j = 0; j < current.size(); j++) {
	Object cval = current.elementAt(j);
	if (cval instanceof String) {
	  if (((String)cval).compareTo("?") == 0) {
	    vals[j] = Instance.missingValue();
	  } else {
	    if (!dataSet.attribute(j).isNominal()) {
	      System.err.println("Wrong attribute type!!!");
	      System.exit(1);
	    }
	    // find correct index
	    Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
	    int index = ((Integer)lookup.get(cval)).intValue();
	    vals[j] = (double)index;
	  }
	} else if (dataSet.attribute(j).isNominal()) {
	  // find correct index
	  Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
	  int index = ((Integer)lookup.get(cval)).intValue();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -