⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 miningcsvstream.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

 /**
  * Title: XELOPES Data Mining Library
  * Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
  * Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
  * Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
  * @author Toni Volkmer (volkmer@prudsys.com)
  * @version 1.0
  */

package com.prudsys.pdm.Input.Records.Csv;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Vector;

import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Utils.ParseFileString;


 /**
  * Extends MiningFileStream for Comma Separated (CSV) Files.
  * Common file format.
  */
public class MiningCsvStream extends MiningFileStream
{
  	//<<Frank J. Xu, 16/02/2005
    //Add function to update the type of categorical data. 
	//protected int m_BoundedThreshold = 500;    	
	//>>Frank J. Xu, 16/02/2005

   // -----------------------------------------------------------------------
   //  Constants for column naming and category handling
   // -----------------------------------------------------------------------
   /**
    * Default value for {@link #columnNameType}.
    */
   public static final short COLUMN_NAME_AUTOMATIC_MODE = 0;

   /**
    * Possible value for {@link #columnNameType}.
    * The first line's field values are used as name for the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
    */
   public static final short COLUMN_NAME_FIRST_LINE = 1;

   /**
    * Possible value for {@link #columnNameType}.
    * The names of the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes} will not be read from file but created like <i>field1</i> for the first field.
    */
   public static final short COLUMN_NAME_CREATE = 2;

   /** Unbounded categories. */
   public static final short CATEGORIES_UNBOUNDED = 0;

   /** Unstored categories. */
   public static final short CATEGORIES_UNSTORED = 1;

   // -----------------------------------------------------------------------
   //  Variables declarations
   // -----------------------------------------------------------------------
  /** Number of lines to go through to search for separator and attributes type */
  protected int nLines = 100;

  /** Index of the current line. 1 means first line. */
  private int curLine = 0;

  /** Character that separates fields */
  protected Character sepChar = null;

  /**
   * Contains the string values used as missing values.
   * Default values are the empty string and a single space character.
   */
  protected String[] usedAsMissingValues = new String[]{""," "};

//  protected boolean missingValues = false;

  protected Character quotationMark = null;

  /**
   * Defines how the names for the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes} are set. <br>
   * {@link #COLUMN_NAME_AUTOMATIC_MODE} is the default value which will use either
   * {@link #COLUMN_NAME_FIRST_LINE COLUMN_NAME_FIRST_LINE} or
   * {@link #COLUMN_NAME_CREATE}.<br>
   * Can be read/set by using {@link #getColumnNameType} and
   * {@link #setColumnNameType(short)}.
   */
  protected short columnNameType = COLUMN_NAME_AUTOMATIC_MODE;

  /**
   * Contains the currently used {@link #columnNameType column name type}. <br>
   * If {@link #COLUMN_NAME_AUTOMATIC_MODE} is set in {@link #columnNameType},
   * <i>usedColumnNameType</i> contains the actually used type.<br>
   * Otherwise <i>usedColumnNameType</i> is always equal to {@link #columnNameType}.
   */
  protected short usedColumnNameType = columnNameType;

  protected short categoriesType = CATEGORIES_UNBOUNDED;

  /**
   * Internally used reader.
   */
  BufferedReader inReader;

  /**
   * All the parsing is done by {@link com.prudsys.pdm.Utils.ParseFileString ParseFileString}.
   */
  protected ParseFileString parse = new ParseFileString();

  // -----------------------------------------------------------------------
  //  Constructors
  // -----------------------------------------------------------------------
  /**
   * Empty constructor.
   */
  public MiningCsvStream()
  {
      // Nothing to configure here; all fields keep their declared defaults.
      super();
  }

  /**
   * Mining Csv stream for a given Csv file and given meta data.
   *
   * @param dataFileName path of Csv file to access
   * @param dataSpecification meta data of file data
   * @throws MiningException
   */
  public MiningCsvStream( String dataFileName,
                          MiningDataSpecification dataSpecification )
      throws MiningException
  {
      super( dataFileName, dataSpecification );

      // Keep the path of the CSV file on this instance as well.
      this.fileName = dataFileName;
  }

  /**
   * Mining file stream for a given file.
   * The meta data is automatically determined when the {@link #open} method is called.
   *
   * @param file path of file to access
   * @throws MiningException
   */
  public MiningCsvStream( String file ) throws MiningException
  {
      // No meta data supplied: open() will determine it via recognize().
      this( file, (MiningDataSpecification) null );
  }

  // -----------------------------------------------------------------------
  //  Getter and setter methods
  // -----------------------------------------------------------------------
  /**
   * Returns supported stream methods.
   *
   * @return supported stream methods
   */
  public java.util.Enumeration getSupportedStreamMethods() {
    // Names of the stream operations this CSV stream advertises.
    Vector supported = new Vector(3);
    supported.add("recognize");
    supported.add("reset");
    supported.add("updateSetMetaData");

    return supported.elements();
  }

  /**
   * Determines if the parameter is a missing value.
   * In order to do that it compares the parameter with all values in
   * {@link #usedAsMissingValues}.
   *
   * @param val String value that is compared with all missing values.
   * @return <b>true</b> if the parameter is equal to one of the missing values, otherwise <b>false</b>.
   */
  protected boolean isMissingValue(String val) {
    // A null field always counts as missing.
    if (val == null) {
      return true;
    }

    // Without any configured missing-value strings nothing else can match.
    if (usedAsMissingValues == null) {
      return false;
    }

    // True when the value is empty or consists of whitespace only.
    final boolean blank = val.trim().length() == 0;

    for (int idx = 0; idx < usedAsMissingValues.length; idx++) {
      final String candidate = usedAsMissingValues[idx];
      if (candidate == null) {
        continue; // null entries in the list are ignored
      }

      if (candidate.equals(val)) {
        return true;
      }

      // A single-space entry also matches runs of several space characters.
      if (blank && candidate.equals(" ")) {
        return true;
      }
    }

    return false;
  }

  /**
   * Gets the number of currently used lines to determine the separator, quotation mark character and attribute type.
   * @return number of lines.
   * @see #nLines
   */
  public int getNumberTestLines() {
    // Plain accessor for the sample-size setting.
    return this.nLines;
  }

  /**
   * Sets the number of currently used lines to determine the separator, quotation mark character and attribute type.
   * @param n new number of test lines.
   * @see #nLines
   */
  public void setNumberTestLines(int n) {
    // Any value below 1 is normalised to the marker value -1.
    this.nLines = (n < 1) ? -1 : n;
  }

  /**
   * Gets the separator used for parsing the input file.
   * If the separator has not been set, Unicode character 0 is returned.
   *
   * @return separator as char
   */
  public char getSeparatorAsChar() {
    // '\0' stands in for "no separator configured".
    return (sepChar == null) ? '\0' : sepChar.charValue();
  }

  /**
   * Gets the separator used for parsing the input file.
   * If the separator has not been set, null is returned.
   *
   * @return separator as Character object
   */
  public Character getSeparator() {
    // May be null when no separator has been set or detected yet.
    return sepChar;
  }

  /**
   * Sets the separator used for parsing the input file.
   *
   * @param separator separator
   */
  public void setSeparator(char separator) {
    // '\0' is the "unset" sentinel and is stored as null.
    this.sepChar = (separator == '\0') ? null : new Character(separator);
  }

  /**
   * Sets the separator used for parsing the input file.
   *
   * @param separator separator
   */
  public void setSeparator(Character separator) {
    // Stored as given; null clears the separator.
    sepChar = separator;
  }

  /**
   * Returns all string values used as missing values.
   * The missing values are kept in {@link #usedAsMissingValues}.
   *
   * @return missing values as String array
   */
  public String[] getMissingValues() {
    // Returns the internal array itself, not a copy.
    return usedAsMissingValues;
  }

  /**
   * Sets the string values used as missing values.
   *
   * @param values missing values as String array.
   */
  public void setMissingValues(String[] values) {
    // The given array is stored by reference; null is accepted.
    usedAsMissingValues = values;
  }

  public short getCategoriesType() {
    // Either CATEGORIES_UNBOUNDED or CATEGORIES_UNSTORED when set through the setter.
    return categoriesType;
  }

  public void setCategoriesType(short type) {
    // Only the two known category types are accepted; anything else
    // falls back to CATEGORIES_UNBOUNDED.
    final boolean known = (type == CATEGORIES_UNBOUNDED) || (type == CATEGORIES_UNSTORED);
    if (known) {
      this.categoriesType = type;
    } else {
      this.categoriesType = CATEGORIES_UNBOUNDED;
    }
  }

  /**
   * Gets the quotation mark character used for parsing the input file.
   * If the quotation mark character has not been set, '\0' is returned.
   *
   * @return quotation mark character as char
   */
  public char getQuotationMarkAsChar() {
    // '\0' stands in for "no quotation mark configured".
    return (quotationMark != null) ? quotationMark.charValue() : '\0';
  }

  /**
   * Gets the quotation mark character used for parsing the input file.
   * If the quotation mark character has not been set, null is returned.
   *
   * @return quotation mark character as Character object
   */
  public Character getQuotationMark() {
    // May be null when no quotation mark has been set or detected yet.
    return quotationMark;
  }

  /**
   * Sets the quotation mark character used for parsing the input file.
   *
   * @param c quotation mark character
   */
  public void setQuotationMark(char c) {
    // '\0' is the "unset" sentinel and is stored as null.
    this.quotationMark = (c == '\0') ? null : new Character(c);
  }

  /**
   * Sets the quotation mark character used for parsing the input file.
   *
   * @param c quotation mark character
   */
  public void setQuotationMark(Character c) {
    // Stored as given; null clears the quotation mark.
    quotationMark = c;
  }

  /**
   * Gets the type how to set the name of the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
   * Possible values are {@link #COLUMN_NAME_AUTOMATIC_MODE},
   * {@link #COLUMN_NAME_CREATE} and {@link #COLUMN_NAME_FIRST_LINE}
   *
   * @return type
   */
  public short getColumnNameType() {
    // This is the configured type, not the one resolved during open().
    return columnNameType;
  }

  /**
   * Sets the type how to set the name of the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
   *
   * @param type {@link #COLUMN_NAME_AUTOMATIC_MODE}, {@link #COLUMN_NAME_CREATE} or {@link #COLUMN_NAME_FIRST_LINE}
   */
  public void setColumnNameType(short type) {
    // Values outside the known range 0..2 fall back to automatic mode.
    if (type < 0 || type > 2) {
      this.columnNameType = MiningCsvStream.COLUMN_NAME_AUTOMATIC_MODE;
      return;
    }

    this.columnNameType = type;
  }

  // -----------------------------------------------------------------------
  //  General stream methods
  // -----------------------------------------------------------------------
  /**
   * Opens mining file stream.
   * If no separator has been set yet, it will be determined.
   * If no meta data has been set yet, it will be determined by calling
   * the {@link #recognize} method.
   *
   * @exception MiningException if a mining source access error occurs
   */
  public void open() throws MiningException {

    usedColumnNameType = columnNameType;

    // Hand the currently configured separator / quotation mark to the parser
    // ('\0' when not set).
    parse.setColumnSeperator(this.getSeparatorAsChar());
    parse.setColumnQuotes(this.getQuotationMarkAsChar());

    if (this.getSeparator() == null) {
      // No separator configured: sample the head of the file and let the
      // parser work out the separator (and quotation mark) from it.
      Vector testLines = new Vector();

      try {
        this.reset();

        // nLines == -1 is what setNumberTestLines() stores for n < 1 and is
        // treated here as "no line limit". The previous guard tested
        // "i == -1", which can never hold for a counter starting at 0, so a
        // value of -1 caused no line to be read at all.
        // NOTE(review): readiness is checked on 'reader' while lines are read
        // from 'inReader' (the original carried a "has to be changed!" remark
        // here); left as is because reset() is not visible from this block.
        for (int i = 0; (this.nLines == -1 || i < this.nLines) && reader.ready(); i++) {
          String line = inReader.readLine();
          if (line == null) {
            break; // end of input reached; do not hand a null line to the parser
          }
          testLines.addElement(line);
        }

        this.reset();
      }
      catch (java.io.IOException ex) {
        // Best effort: continue with whatever lines were read so far; the
        // size check below reports the failure if nothing was read.
        ex.printStackTrace();
      }

      if (testLines.size() < 1) {
        throw new MiningException("Error at reading line");
      }

      // Convert vector to String array
      String testLinesArray[] = new String[testLines.size()];
      testLines.copyInto(testLinesArray);

      if (this.getQuotationMark() != null) {
        parse.setColumnQuotes(this.getQuotationMarkAsChar());
      }

      if (!parse.findParameters(testLinesArray)) {
        throw new MiningException(
            "MiningCsvStream: Unabled to find a separator.");
      }

      this.setSeparator(parse.getColumnSeperator());

      // If the quotation mark has been set but not the separator,
      // it can have changed.
      this.setQuotationMark(parse.getColumnQuotes());
    }
    else { // separator set
      reset();
    }

    if (this.getMetaData() == null) {
      // No meta data yet: derive it from the file contents.
      this.updateSetMetaData(recognize());
      reset(); // not necessary after recognize, but secure
    }
  }

  /**
   * Closes mining file stream.
   *
   * @exception MiningException if a mining source access error occurs
   */
  public void close() throws MiningException {
    // Let the parent stream close first.
    super.close();

    // Then drop the references held by this instance. With metaData cleared,
    // a later open() will call recognize() again.
    inReader = null;
    metaData = null;
    cursorVector = null;
  }

  /**
   * Calls {@link #recognize(short)} in order to create meta data.
   * Therefore the first {@link #nLines} lines are read.
   *
   * @return meta data obtained
   * @throws MiningException could not recognize file
   */
  public MiningDataSpecification recognize() throws MiningException
  {
      usedColumnNameType = columnNameType;

      if(usedColumnNameType==MiningCsvStream.COLUMN_NAME_AUTOMATIC_MODE) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -