/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (valentine.stepanenko@zsoft.ru)
* @version 1.0
*/
package com.prudsys.pdm.Input.Records.Arff;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.CategoryProperty;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataException;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningSparseVector;
import com.prudsys.pdm.Input.MiningVector;
/**
* Extends MiningFileStream for ARFF files of the
* WEKA library.
*
* Supports the updatable stream type. Attention: The update mode
* always overwrites the previous data section (but not the meta data).
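*
* <p>A minimal usage sketch (the file name "weather.arff" is hypothetical;
* only methods declared by this class are assumed):
* <pre>
*   MiningArffStream stream = new MiningArffStream( "weather.arff" );
*   MiningDataSpecification meta = stream.recognize(); // reads the ARFF header
*   while( stream.next() )                              // advance over the data rows
*   {
*       // process the current vector here
*   }
* </pre>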
*/
public class MiningArffStream extends MiningFileStream
{
// -----------------------------------------------------------------------
// Variables declarations
// -----------------------------------------------------------------------
/** Tokenizer for reading. */
protected StreamTokenizer tokenizer;
/** Indicates whether the tokenizer is parsing meta data or data. */
protected final int SYNTAX_MODE_META_DATA = 0;
protected final int SYNTAX_MODE_DATA = 1;
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
/**
* Empty constructor.
*/
public MiningArffStream()
{
}
/**
* Mining ARFF stream for a given ARFF file and given meta data.
*
* @param dataFileName path of ARFF file to access
* @param metaData meta data of file data
*/
public MiningArffStream( String dataFileName, MiningDataSpecification metaData ) throws MiningException
{
super( dataFileName, metaData );
fileName = dataFileName;
if( metaData == null )
{
metaData = recognize();
}
reset();
}
/**
* Mining file stream for a given file.
* The meta data is automatically determined using the
* method recognize which reads the ARFF header.
*
* @param dataFileName path of ARFF file to access
*/
public MiningArffStream( String dataFileName ) throws MiningException
{
super( dataFileName );
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
* Returns supported stream methods.
*
* @return supported stream methods
*/
public Enumeration getSupportedStreamMethods() {
Vector suppmeth = new Vector();
suppmeth.addElement("recognize");
suppmeth.addElement("reset");
suppmeth.addElement("updateSetMetaData");
suppmeth.addElement("updateRemoveAllVectors");
suppmeth.addElement("updateAppendVector");
return suppmeth.elements();
}
// -----------------------------------------------------------------------
// General stream methods
// -----------------------------------------------------------------------
/**
* Reads and stores header of an ARFF file.
*
* @return meta data of ARFF file
* @exception MiningException if the information is not read
* successfully
*/
public MiningDataSpecification recognize() throws MiningException
{
if( metaData == null )
{
super.reset();
// Tokenizer starts in meta data syntax mode
initTokenizer( reader , SYNTAX_MODE_META_DATA);
metaData = new MiningDataSpecification();
String attributeName;
ArrayList attributeValues;
String token;
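// Illustrative example of the ARFF header layout parsed below
// (hypothetical relation and attribute names):
//   @relation weather
//   @attribute outlook {sunny, overcast, rainy}
//   @attribute temperature real
//   @attribute description string
//   @data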
// Get name of relation.
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if(token.equalsIgnoreCase("@relation"))
{
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
metaData.setRelationName( token );
token = getNextToken();
if( !token.equalsIgnoreCase( "endofline" ) )
{
if( !token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException("end of line expected");
}
}
}
else
{
tokenizerException("keyword @relation expected");
}
// Get attribute declarations.
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if( token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException("premature end of file");
}
while( token.equalsIgnoreCase("@attribute"))
{
// Get attribute name.
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
attributeName = token;
token = getNextToken();
if( token.equalsIgnoreCase( "endoffile" ) || token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("premature end of line or file");
}
// Check if attribute is nominal.
if( token.equalsIgnoreCase("{") )
{
// Attribute is nominal.
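// e.g. "@attribute outlook {sunny, overcast, rainy}" (hypothetical example):
// every value listed between the braces becomes a Category.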
attributeValues = new ArrayList();
CategoricalAttribute categoricalAttribute = new CategoricalAttribute( attributeName );
token = getNextToken();
while( !token.equalsIgnoreCase( "}" ) )
{
if( token.equalsIgnoreCase( "endofline" ) )
{
tokenizerException("} expected at end of enumeration");
}
else
{
categoricalAttribute.addCategory( new Category( token, token, new CategoryProperty() ) );
}
token = getNextToken();
}
//<<tyleung 23/3/2005
// if (categoricalAttribute.getCategoriesNumber() == 0)
// {
// tokenizerException("no nominal values found");
// }
//tyleung 23/3/2005 >>
metaData.addMiningAttribute( categoricalAttribute );
}
else
{
// Attribute is real, integer, or string.
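// e.g. "@attribute temperature real" or "@attribute description string" (hypothetical examples).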
if ( token.equalsIgnoreCase("real") || token.equalsIgnoreCase("integer") || token.equalsIgnoreCase("numeric"))
{
NumericAttribute numAtt = new NumericAttribute( attributeName );
// default data type is double
if ( token.equalsIgnoreCase("integer") )
numAtt.setDataType( NumericAttribute.INTEGER );
metaData.addMiningAttribute( numAtt );
}
else
if (token.equalsIgnoreCase("string"))
{
CategoricalAttribute stringAttribute = new CategoricalAttribute( attributeName );
stringAttribute.setUnboundedCategories(true);
metaData.addMiningAttribute( stringAttribute );
}
else
{
tokenizerException("no valid attribute type or invalid "+ "enumeration");
}
}
token = getNextToken();
if( !token.equalsIgnoreCase( "endofline" ) )
{
if( !token.equalsIgnoreCase( "endoffile" ) )
{
tokenizerException( "end of line expected");
}
}
token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
}
// Check if any attributes have been declared.
if( metaData.getAttributesNumber() == 0 )
{
tokenizerException("no attributes declared");
}
}
reset();
return metaData;
}
// -----------------------------------------------------------------------
// Methods of cursor positioning
// -----------------------------------------------------------------------
/**
* Places the cursor before first row.
* This is done by closing and reopening the file reader.
*/
public void reset() throws MiningException
{
super.reset();
initTokenizer( reader, SYNTAX_MODE_META_DATA );
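// Scan forward to the "@data" marker; the data rows start right after it.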
String token = getNextToken();
while( !token.equalsIgnoreCase( "@data" ) )
{
token = getNextToken();
}
// Change the syntax mode to DATA
//resetTokenizerSyntax(SYNTAX_MODE_DATA);
cursorPosition = -1;
}
/**
* Advance cursor by one position.
*
* @return true if next vector exists, else false
*/
public boolean next() throws MiningException
{
// Check if end of file reached.
String token = getNextToken();
while( token.equalsIgnoreCase( "endofline" ) )
{
token = getNextToken();
}
if( token.equalsIgnoreCase( "endoffile" ) )
{
return false;
}
if( !token.equals( "{" ) )
{
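// Dense instance (no leading "{"): one value per attribute in declaration order,
// e.g. "5.1,3.5,1.4,0.2,Iris-setosa" (hypothetical example).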
double[] instance = new double[ metaData.getAttributesNumber() ];
// Get values for all attributes.
for (int i = 0; i < metaData.getAttributesNumber(); i++)
{
// Get next token
if (i > 0)
{
token = getNextToken();
}
//<<09/03/2005, Frank J. Xu
// Do not handle this kind of error here; just throw an exception.
// Bug 124: the error message "end of line expected, read token[4,9], line 75]" was shown when
// running the source node with \\etifs2\BI\96 Sample Data\arff\iris-error.arff. The
// first data vector contains an empty value, i.e. 5.1,3.5,1.4,0.2,,0
if(token.equalsIgnoreCase( "endofline" ) )
{
int startIndex = tokenizer.toString().lastIndexOf(" ")-4;
int endIndex = tokenizer.toString().length();
String errorMsg = tokenizer.toString().substring(startIndex, endIndex);
throw new MiningDataException("Invalid characters in " + errorMsg + ".");
}
//>>09/03/2005, Frank J. Xu
MiningAttribute attribute = metaData.getMiningAttribute( i );
if (token.length()<=0)
{
instance[i] = Category.MISSING_VALUE;
} else if (attribute instanceof CategoricalAttribute)
{
Category cat = new Category(token);
instance[i] = ((CategoricalAttribute)attribute).getKey( cat );
// ++++++++++++++ Change by M. Thess: if token is missing value ("?"), do not add to string attribute:
if ( Category.isMissingValue(instance[i]) && ((CategoricalAttribute)attribute).isUnboundedCategories() && !token.equals("?") )
{
instance[i] = ((CategoricalAttribute)attribute).addCategory( cat );