/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (valentine.stepanenko@zsoft.ru)
* @version 1.0
*/
package com.prudsys.pdm.Input.Records.Arff;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.CategoryProperty;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataException;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningSparseVector;
import com.prudsys.pdm.Input.MiningVector;
/**
* Extends MiningFileStream for ARFF files of the
* WEKA library.
*
* Supports the updatable stream type. Attention: The update mode
* always overwrites the previous data section (but not the meta data).
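*
* <p>A minimal usage sketch (the file name "weather.arff" is hypothetical;
* only methods declared by this class are assumed):
* <pre>
*   MiningArffStream stream = new MiningArffStream( "weather.arff" );
*   MiningDataSpecification meta = stream.recognize(); // reads the ARFF header
*   while( stream.next() )                              // advance over the data rows
*   {
*       // process the current vector here
*   }
* </pre>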
*/
public class MiningArffStream extends MiningFileStream
{
// -----------------------------------------------------------------------
// Variables declarations
// -----------------------------------------------------------------------
/** Tokenizer for reading. */
protected StreamTokenizer tokenizer;
/** Indicates whether the tokenizer is parsing meta data or data. */
protected final int SYNTAX_MODE_META_DATA = 0;
protected final int SYNTAX_MODE_DATA = 1;
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
/**
* Empty constructor.
*/
public MiningArffStream()
{
}
/**
* Mining ARFF stream for a given ARFF file and given meta data.
*
* @param dataFileName path of ARFF file to access
* @param metaData meta data of file data
*/
public MiningArffStream( String dataFileName, MiningDataSpecification metaData ) throws MiningException
{
super( dataFileName, metaData );
fileName = dataFileName;
if( metaData == null )
{
metaData = recognize();
}
reset();
}
/**
* Mining file stream for a given file.
* The meta data is automatically determined using the
* method recognize which reads the ARFF header.
*
* @param dataFileName path of ARFF file to access
*/
public MiningArffStream( String dataFileName ) throws MiningException
{
super( dataFileName );
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
* Returns supported stream methods.
*
* @return supported stream methods
*/
public Enumeration getSupportedStreamMethods() {
Vector suppmeth = new Vector();
suppmeth.addElement("recognize");
suppmeth.addElement("reset");
suppmeth.addElement("updateSetMetaData");
suppmeth.addElement("updateRemoveAllVectors");
suppmeth.addElement("updateAppendVector");
return suppmeth.elements();
}
// -----------------------------------------------------------------------
// General stream methods
// -----------------------------------------------------------------------
/**
* Reads and stores header of an ARFF file.
*
* @return meta data of ARFF file
* @exception MiningException if the information is not read
* successfully
*/
public MiningDataSpecification recognize() throws MiningException
{
if( metaData == null )
{
super.reset();
// Tokenizer starts in meta data syntax mode
initTokenizer( reader , SYNTAX_MODE_META_DATA);
metaData = new MiningDataSpecification();
String attributeName;
ArrayList attributeValues;
String token;
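// Illustrative example of the ARFF header layout parsed below
// (hypothetical relation and attribute names):
//   @relation weather
//   @attribute outlook {sunny, overcast, rainy}
//   @attribute temperature real
//   @attribute description string
//   @data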
// Get name of relation.
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if(token.equalsIgnoreCase("@relation"))
{
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
metaData.setRelationName( token );
token = getNextToken();
if( !token.equalsIgnoreCase( "endofline" ) )
{
if( !token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException("end of line expected");
}
}
}
else
{
tokenizerException("keyword @relation expected");
}
// Get attribute declarations.
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if( token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException("premature end of file");
}
while( token.equalsIgnoreCase("@attribute"))
{
// Get attribute name.
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
attributeName = token;
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
// Check if attribute is nominal.
if( token.equalsIgnoreCase("{") )
{
// Attribute is nominal.
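// e.g. "@attribute outlook {sunny, overcast, rainy}" (hypothetical example):
// every value listed between the braces becomes a Category.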
attributeValues = new ArrayList();
CategoricalAttribute categoricalAttribute = new CategoricalAttribute( attributeName );
token = getNextToken();
while( !token.equalsIgnoreCase( "}" ) )
{
if( token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("} expected at end of enumeration");
}
else
{
categoricalAttribute.addCategory( new Category( token, token, new CategoryProperty() ) );
}
token = getNextToken();
}
//<<tyleung 23/3/2005
// if (categoricalAttribute.getCategoriesNumber() == 0)
// {
// tokenizerException("no nominal values found");
// }
//tyleung 23/3/2005 >>
metaData.addMiningAttribute( categoricalAttribute );
}
else
{
// Attribute is real, integer, or string.
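// e.g. "@attribute temperature real" or "@attribute description string" (hypothetical examples).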
if ( token.equalsIgnoreCase("real") || token.equalsIgnoreCase("integer") || token.equalsIgnoreCase("numeric"))
{
NumericAttribute numAtt = new NumericAttribute( attributeName );
// default data type is double
if ( token.equalsIgnoreCase("integer") )
numAtt.setDataType( NumericAttribute.INTEGER );
metaData.addMiningAttribute( numAtt );
}
else
if (token.equalsIgnoreCase("string"))
{
CategoricalAttribute stringAttribute = new CategoricalAttribute( attributeName );
stringAttribute.setUnboundedCategories(true);
metaData.addMiningAttribute( stringAttribute );
}
else
{
tokenizerException("no valid attribute type or invalid "+ "enumeration");
}
}
token = getNextToken();
if( !token.equalsIgnoreCase( "endofline" ) )
{
if( !token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException( "end of line expected");
}
}
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
}
// Check if any attributes have been declared.
if( metaData.getAttributesNumber() == 0 )
{
tokenizerException("no attributes declared");
}
}
reset();
return metaData;
}
// -----------------------------------------------------------------------
// Methods of cursor positioning
// -----------------------------------------------------------------------
/**
* Places the cursor before first row.
* This is done by closing and reopening the file reader.
*/
public void reset() throws MiningException
{
super.reset();
initTokenizer( reader, SYNTAX_MODE_META_DATA );
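// Scan forward to the "@data" marker; the data rows start right after it.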
String token = getNextToken();
while( !token.equalsIgnoreCase( "@data" ) )
{
token = getNextToken();
}
// Change the syntax mode to DATA
//resetTokenizerSyntax(SYNTAX_MODE_DATA);
cursorPosition = -1;
}
/**
* Advance cursor by one position.
*
* @return true if next vector exists, else false
*/
public boolean next() throws MiningException
{
// Check if end of file reached.
String token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if( token.equalsIgnoreCase( "endoffile" ) )
{
return false;
}
if( !token.equals( "{" ) )
{
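// Dense instance (no leading "{"): one value per attribute in declaration order,
// e.g. "5.1,3.5,1.4,0.2,Iris-setosa" (hypothetical example).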
double[] instance = new double[ metaData.getAttributesNumber() ];
// Get values for all attributes.
for (int i = 0; i < metaData.getAttributesNumber(); i++)
{
// Get next token
if (i > 0)
{
token = getNextToken();
}
//<<09/03/2005, Frank J. Xu
// Do not handle this kind of error here; just throw an exception.
// Bug 124: the error message "end of line expected, read token[4,9], line 75]" was shown when
// running the source node with \\etifs2\BI\96 Sample Data\arff\iris-error.arff. The
// first data vector contains an empty value, i.e. 5.1,3.5,1.4,0.2,,0
if(token.equalsIgnoreCase( "endofline" ) )
{
int startIndex = tokenizer.toString().lastIndexOf(" ")-4;
int endIndex = tokenizer.toString().length();
String errorMsg = tokenizer.toString().substring(startIndex, endIndex);
throw new MiningDataException("Invalid characters in " + errorMsg + ".");
}
//>>09/03/2005, Frank J. Xu
MiningAttribute attribute = metaData.getMiningAttribute( i );
if (token.length()<=0)
{
instance[i] = Category.MISSING_VALUE;
} else if (attribute instanceof CategoricalAttribute)
{
Category cat = new Category(token);
instance[i] = ((CategoricalAttribute)attribute).getKey( cat );
// ++++++++++++++ Change by M. Thess: if token is missing value ("?"), do not add to string attribute:
if ( Category.isMissingValue(instance[i]) && ((CategoricalAttribute)attribute).isUnboundedCategories() && !token.equals("?") )
{
instance[i] = ((CategoricalAttribute)attribute).addCategory( cat );