📄 miningcsvstream.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Toni Volkmer (volkmer@prudsys.com)
* @version 1.0
*/
package com.prudsys.pdm.Input.Records.Csv;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Utils.ParseFileString;
/**
* Extends MiningFileStream for Comma Separated (CSV) Files.
* Common file format.
*/
public class MiningCsvStream extends MiningFileStream
{
//<<Frank J. Xu, 16/02/2005
//Add function to update the type of categorical data.
//protected int m_BoundedThreshold = 500;
//>>Frank J. Xu, 16/02/2005
// -----------------------------------------------------------------------
// Constants for column naming and category handling
// -----------------------------------------------------------------------
/**
* Default value for {@link #columnNameType}.
* In this mode either {@link #COLUMN_NAME_FIRST_LINE} or
* {@link #COLUMN_NAME_CREATE} ends up being used.
*/
public static final short COLUMN_NAME_AUTOMATIC_MODE = 0;
/**
* Possible value for {@link #columnNameType}.
* The first line's field values are used as names for the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
*/
public static final short COLUMN_NAME_FIRST_LINE = 1;
/**
* Possible value for {@link #columnNameType}.
* The names of the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes} are not read from the file but generated, e.g. <i>field1</i> for the first field.
*/
public static final short COLUMN_NAME_CREATE = 2;
/** Unbounded categories. Default value for {@link #categoriesType}. */
public static final short CATEGORIES_UNBOUNDED = 0;
/** Unstored categories. Possible value for {@link #categoriesType}. */
public static final short CATEGORIES_UNSTORED = 1;
// -----------------------------------------------------------------------
// Variables declarations
// -----------------------------------------------------------------------
/**
* Number of lines to go through to search for the separator and the attribute types.
* {@link #setNumberTestLines(int)} stores -1 here for any argument below 1.
*/
protected int nLines = 100;
/** Index of the current line. 1 means first line. */
private int curLine = 0;
/** Character that separates fields; <code>null</code> while no separator has been set. */
protected Character sepChar = null;
/**
* Contains the string values used as missing values.
* Default values are the empty string and a single space character
* (a question mark is NOT part of the defaults).
*/
protected String[] usedAsMissingValues = new String[]{""," "};
// protected boolean missingValues = false;
/** Quotation mark character used for parsing; <code>null</code> while none has been set. */
protected Character quotationMark = null;
/**
* Defines how the names for the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes} are set. <br>
* {@link #COLUMN_NAME_AUTOMATIC_MODE} is the default value which will use either
* {@link #COLUMN_NAME_FIRST_LINE COLUMN_NAME_FIRST_LINE} or
* {@link #COLUMN_NAME_CREATE}.<br>
* Can be read/set by using {@link #getColumnNameType} and
* {@link #setColumnNameType(short)}.
*/
protected short columnNameType = COLUMN_NAME_AUTOMATIC_MODE;
/**
* Contains the currently used {@link #columnNameType column name type}. <br>
* If {@link #COLUMN_NAME_AUTOMATIC_MODE} is set in {@link #columnNameType},
* <i>usedColumnNameType</i> contains the actually used type.<br>
* Otherwise <i>usedColumnNameType</i> is always equal to {@link #columnNameType}.
*/
protected short usedColumnNameType = columnNameType;
/**
* Category handling mode: {@link #CATEGORIES_UNBOUNDED} (default) or
* {@link #CATEGORIES_UNSTORED}.
* Can be read/set by using {@link #getCategoriesType} and
* {@link #setCategoriesType(short)}.
*/
protected short categoriesType = CATEGORIES_UNBOUNDED;
/**
* Internally used reader (package-private).
* {@link #open} reads its test lines from it and {@link #close} sets it to <code>null</code>.
*/
BufferedReader inReader;
/**
* All the parsing is done by {@link com.prudsys.pdm.Utils.ParseFileString ParseFileString}.
*/
protected ParseFileString parse = new ParseFileString();
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
/**
 * Creates a CSV stream without a file name or meta data.
 * Only the superclass default constructor runs.
 */
public MiningCsvStream()
{
    super();
}
/**
 * Creates a CSV stream for the given file and the given meta data.
 * Both arguments are forwarded to the superclass constructor; the file
 * name is additionally stored in <code>fileName</code>.
 *
 * @param dataFileName path of the CSV file to access
 * @param dataSpecification meta data describing the file's data
 * @throws MiningException if the superclass constructor fails
 */
public MiningCsvStream( String dataFileName, MiningDataSpecification dataSpecification ) throws MiningException
{
    super( dataFileName, dataSpecification );
    this.fileName = dataFileName;
}
/**
 * Creates a CSV stream for the given file without meta data.
 * Delegates to the two-argument constructor with a <code>null</code>
 * specification; the meta data is then determined automatically when
 * {@link #open} is called.
 *
 * @param file path of the file to access
 * @throws MiningException if the delegated constructor fails
 */
public MiningCsvStream( String file ) throws MiningException
{
    this( file, (MiningDataSpecification) null );
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
 * Returns the names of the stream methods supported by this class.
 * The enumeration yields, in this order, "recognize", "reset" and
 * "updateSetMetaData".
 *
 * @return enumeration of the supported stream method names
 */
public java.util.Enumeration getSupportedStreamMethods() {
    final String[] methodNames = { "recognize", "reset", "updateSetMetaData" };
    Vector supported = new Vector(methodNames.length);
    for (int k = 0; k < methodNames.length; k++) {
        supported.addElement(methodNames[k]);
    }
    return supported.elements();
}
/**
 * Decides whether a field value counts as a missing value.
 * <code>null</code> is always missing. Otherwise the value is compared
 * with every non-null entry of {@link #usedAsMissingValues}; if that list
 * contains the single space string, any value consisting only of
 * whitespace is treated as missing too.
 *
 * @param val string value to check
 * @return <b>true</b> if the value is <code>null</code> or matches one of the missing values, otherwise <b>false</b>
 */
protected boolean isMissingValue(String val) {
    if (val == null) {
        return true;
    }
    final String[] markers = usedAsMissingValues;
    if (markers == null) {
        return false;
    }
    for (int idx = 0; idx < markers.length; idx++) {
        final String marker = markers[idx];
        if (marker == null) {
            continue;
        }
        if (marker.equals(val)) {
            return true;
        }
        // a single-space marker also matches runs of several space characters
        if (marker.equals(" ") && val.trim().equals("")) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the number of lines currently used to determine the separator,
 * the quotation mark character and the attribute types.
 *
 * @return number of test lines
 * @see #nLines
 */
public int getNumberTestLines() {
    return this.nLines;
}
/**
 * Sets the number of lines used to determine the separator, the quotation
 * mark character and the attribute types.
 * Any argument below 1 is stored as -1.
 *
 * @param n new number of test lines
 * @see #nLines
 */
public void setNumberTestLines(int n) {
    this.nLines = (n < 1) ? -1 : n;
}
/**
 * Returns the separator used for parsing the input file as a primitive.
 * While no separator has been set, the Unicode character 0 is returned.
 *
 * @return separator as char, or '\0' if unset
 */
public char getSeparatorAsChar() {
    return (sepChar == null) ? '\0' : sepChar.charValue();
}
/**
 * Returns the separator used for parsing the input file.
 *
 * @return separator as Character object, or <code>null</code> if it has not been set
 */
public Character getSeparator() {
    return sepChar;
}
/**
 * Sets the separator used for parsing the input file.
 * Passing the Unicode character 0 clears the separator (it becomes
 * <code>null</code>).
 *
 * @param separator separator character, or '\0' to unset it
 */
public void setSeparator(char separator) {
    this.sepChar = (separator == '\0') ? null : new Character(separator);
}
/**
 * Sets the separator used for parsing the input file.
 * The given object is stored as is; <code>null</code> leaves the
 * separator unset.
 *
 * @param separator separator, may be <code>null</code>
 */
public void setSeparator(Character separator) {
    sepChar = separator;
}
/**
 * Returns all string values used as missing values.
 * The array held in {@link #usedAsMissingValues} itself is returned,
 * not a copy.
 *
 * @return missing values as String array
 */
public String[] getMissingValues() {
    return usedAsMissingValues;
}
/**
 * Sets the string values used as missing values.
 * The given array is stored as is (no copy is made).
 *
 * @param values missing values as String array
 */
public void setMissingValues(String[] values) {
    usedAsMissingValues = values;
}
/**
 * Returns the currently set categories type.
 *
 * @return value of {@link #categoriesType}
 */
public short getCategoriesType() {
    return categoriesType;
}
/**
 * Sets the categories type.
 * Values outside the range 0..1 fall back to
 * {@link #CATEGORIES_UNBOUNDED}.
 *
 * @param type {@link #CATEGORIES_UNBOUNDED} or {@link #CATEGORIES_UNSTORED}
 */
public void setCategoriesType(short type) {
    final boolean inRange = type >= 0 && type <= 1;
    if (inRange) {
        this.categoriesType = type;
        return;
    }
    this.categoriesType = CATEGORIES_UNBOUNDED;
}
/**
 * Returns the quotation mark character used for parsing the input file
 * as a primitive.
 * While no quotation mark has been set, '\0' is returned.
 *
 * @return quotation mark character as char, or '\0' if unset
 */
public char getQuotationMarkAsChar() {
    return (quotationMark != null) ? quotationMark.charValue() : '\0';
}
/**
 * Returns the quotation mark character used for parsing the input file.
 *
 * @return quotation mark character as Character object, or <code>null</code> if it has not been set
 */
public Character getQuotationMark() {
    return quotationMark;
}
/**
 * Sets the quotation mark character used for parsing the input file.
 * Passing '\0' clears the quotation mark (it becomes <code>null</code>).
 *
 * @param c quotation mark character, or '\0' to unset it
 */
public void setQuotationMark(char c) {
    this.quotationMark = (c == '\0') ? null : new Character(c);
}
/**
 * Sets the quotation mark character used for parsing the input file.
 * The given object is stored as is; <code>null</code> leaves the
 * quotation mark unset.
 *
 * @param c quotation mark character, may be <code>null</code>
 */
public void setQuotationMark(Character c) {
    quotationMark = c;
}
/**
 * Returns the configured way of naming the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
 * Possible values are {@link #COLUMN_NAME_AUTOMATIC_MODE},
 * {@link #COLUMN_NAME_CREATE} and {@link #COLUMN_NAME_FIRST_LINE}.
 *
 * @return column name type
 */
public short getColumnNameType() {
    return columnNameType;
}
/**
 * Sets the way of naming the {@link com.prudsys.pdm.Core.MiningAttribute mining attributes}.
 * Values outside the range 0..2 fall back to
 * {@link #COLUMN_NAME_AUTOMATIC_MODE}.
 *
 * @param type {@link #COLUMN_NAME_AUTOMATIC_MODE}, {@link #COLUMN_NAME_CREATE} or {@link #COLUMN_NAME_FIRST_LINE}
 */
public void setColumnNameType(short type) {
    if (type < 0 || type > 2) {
        this.columnNameType = MiningCsvStream.COLUMN_NAME_AUTOMATIC_MODE;
        return;
    }
    this.columnNameType = type;
}
// -----------------------------------------------------------------------
// General stream methods
// -----------------------------------------------------------------------
/**
 * Opens the mining file stream.
 * <p>
 * The configured separator and quotation mark are handed to the parser
 * first. If no separator has been set, up to {@link #nLines} leading lines
 * are read and the parser is asked to detect the separator (and possibly a
 * different quotation mark) from them; a negative {@link #nLines}, as
 * stored by {@link #setNumberTestLines(int)} for arguments below 1, lifts
 * the line limit. If no meta data has been set yet, it is determined by
 * calling the {@link #recognize()} method.
 *
 * @exception MiningException if no line could be read, if no separator
 *            could be detected, or if a mining source access error occurs
 */
public void open() throws MiningException {
    usedColumnNameType = columnNameType;
    parse.setColumnSeperator(this.getSeparatorAsChar());
    parse.setColumnQuotes(this.getQuotationMarkAsChar());
    if (this.getSeparator() == null) {
        // No separator configured: sample the head of the file and let the
        // parser detect it.
        Vector testLines = new Vector();
        this.reset();
        try {
            String line;
            // Read from inReader until the line limit or end of input.
            // The limit check must look at nLines (the original tested the
            // loop counter against -1, which is never true), and the end of
            // input is taken from readLine() returning null so that no null
            // entry can reach the parser.
            // NOTE(review): presumably inReader buffers `reader`, in which
            // case polling reader.ready() would under-report the buffered
            // data -- confirm against reset().
            for (int i = 0;
                    (this.nLines < 0 || i < this.nLines)
                        && (line = inReader.readLine()) != null;
                    i++) {
                testLines.addElement(line);
            }
        }
        catch (java.io.IOException ex) {
            // Best effort as before: continue with the lines read so far.
            ex.printStackTrace();
        }
        // Rewind even when reading failed part-way, so that later reads
        // start at the beginning of the file.
        this.reset();
        if (testLines.size() < 1)
            throw new MiningException("Error at reading line");
        // Convert vector to String array
        String testLinesArray[] = new String[testLines.size()];
        testLines.copyInto(testLinesArray);
        if (this.getQuotationMark() != null)
            parse.setColumnQuotes(this.getQuotationMarkAsChar());
        if (!parse.findParameters(testLinesArray))
            throw new MiningException(
                "MiningCsvStream: Unabled to find a separator.");
        this.setSeparator(parse.getColumnSeperator());
        // If the quotation mark has been set but not the separator,
        // it can have changed.
        this.setQuotationMark(parse.getColumnQuotes());
    }
    else { // separator set
        reset();
    }
    if (this.getMetaData() == null) {
        this.updateSetMetaData(recognize());
        reset(); // not necessary after recognize, but secure
    }
}
/**
 * Closes the mining file stream.
 * After the superclass has closed the stream, the references to the
 * cursor vector, the meta data and the internal reader are cleared.
 *
 * @exception MiningException if a mining source access error occurs
 */
public void close() throws MiningException {
    super.close();
    // drop the per-stream state so that it can be garbage collected
    this.inReader = null;
    this.metaData = null;
    this.cursorVector = null;
}
/**
* Calls {@link #recognize(short)} in order to create meta data.
* Therefore the first {@link #nLines} lines are read.
*
* @return meta data obtained
* @throws MiningException could not recognize file
*/
public MiningDataSpecification recognize() throws MiningException
{
usedColumnNameType = columnNameType;
if(usedColumnNameType==MiningCsvStream.COLUMN_NAME_AUTOMATIC_MODE) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -