📄 arffloader.java
字号:
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * ArffLoader.java * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand * */package weka.core.converters;import weka.core.Attribute;import weka.core.FastVector;import weka.core.Instance;import weka.core.Instances;import weka.core.SparseInstance;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.Reader;import java.io.StreamTokenizer;import java.io.StringReader;import java.net.URL;import java.text.ParseException;import java.util.zip.GZIPInputStream;/** <!-- globalinfo-start --> * Reads a source that is in arff (attribute relation file format) format. 
 * <p/> <!-- globalinfo-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.19 $
 * @see Loader
 */
public class ArffLoader
  extends AbstractFileLoader
  implements BatchConverter, IncrementalConverter, URLSourcedLoader {

  /** for serialization */
  static final long serialVersionUID = 2726929550544048587L;

  /** the file extension of uncompressed ARFF files */
  public static String FILE_EXTENSION = Instances.FILE_EXTENSION;

  /** the file extension of gzip-compressed ARFF files */
  public static String FILE_EXTENSION_COMPRESSED = FILE_EXTENSION + ".gz";

  /** the URL to load from (defaults to the bare scheme prefix) */
  protected String m_URL = "http://";

  /** The reader for the source file. */
  protected transient Reader m_sourceReader = null;

  /** The parser for the ARFF file. */
  protected transient ArffReader m_ArffReader = null;

  /**
   * Reads data from an ARFF file, either in incremental or batch mode. <p/>
   *
   * Typical code for batch usage:
   * <pre>
   * BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader);
   * Instances data = arff.getData();
   * data.setClassIndex(data.numAttributes() - 1);
   * </pre>
   *
   * Typical code for incremental usage:
   * <pre>
   * BufferedReader reader = new BufferedReader(new FileReader("/some/where/file.arff"));
   * ArffReader arff = new ArffReader(reader, 1000);
   * Instances data = arff.getStructure();
   * data.setClassIndex(data.numAttributes() - 1);
   * Instance inst;
   * while ((inst = arff.readInstance(data)) != null) {
   *   data.add(inst);
   * }
   * </pre>
   *
   * @author Eibe Frank (eibe@cs.waikato.ac.nz)
   * @author Len Trigg (trigg@cs.waikato.ac.nz)
   * @author fracpete (fracpete at waikato dot ac dot nz)
   * @version $Revision: 1.19 $
   */
  public static class ArffReader {

    /** the tokenizer for reading the stream */
    protected StreamTokenizer m_Tokenizer;

    /** Buffer of values for sparse instance */
    protected double[] m_ValueBuffer;

    /** Buffer of indices for sparse instance */
    protected int[] m_IndicesBuffer;

    /** the actual
data */ protected Instances m_Data; /** the number of lines read so far */ protected int m_Lines; /** * Reads the data completely from the reader. The data can be accessed * via the <code>getData()</code> method. * * @param reader the reader to use * @throws IOException if something goes wrong * @see #getData() */ public ArffReader(Reader reader) throws IOException { m_Tokenizer = new StreamTokenizer(reader); initTokenizer(); readHeader(1000); initBuffers(); Instance inst; while ((inst = readInstance(m_Data)) != null) { m_Data.add(inst); }; compactify(); } /** * Reads only the header and reserves the specified space for instances. * Further instances can be read via <code>readInstance()</code>. * * @param reader the reader to use * @param capacity the capacity of the new dataset * @throws IOException if something goes wrong * @throws IllegalArgumentException if capacity is negative * @see #getStructure() * @see #readInstance(Instances) */ public ArffReader(Reader reader, int capacity) throws IOException { if (capacity < 0) throw new IllegalArgumentException("Capacity has to be positive!"); m_Tokenizer = new StreamTokenizer(reader); initTokenizer(); readHeader(capacity); initBuffers(); } /** * Reads the data without header according to the specified template. * The data can be accessed via the <code>getData()</code> method. * * @param reader the reader to use * @param template the template header * @param lines the lines read so far * @throws IOException if something goes wrong * @see #getData() */ public ArffReader(Reader reader, Instances template, int lines) throws IOException { this(reader, template, lines, 100); Instance inst; while ((inst = readInstance(m_Data)) != null) { m_Data.add(inst); }; compactify(); } /** * Initializes the reader without reading the header according to the * specified template. The data must be read via the * <code>readInstance()</code> method. 
* * @param reader the reader to use * @param template the template header * @param lines the lines read so far * @param capacity the capacity of the new dataset * @throws IOException if something goes wrong * @see #getData() */ public ArffReader(Reader reader, Instances template, int lines, int capacity) throws IOException { m_Lines = lines; m_Tokenizer = new StreamTokenizer(reader); initTokenizer(); m_Data = new Instances(template, capacity); initBuffers(); } /** * initializes the buffers for sparse instances to be read * * @see #m_ValueBuffer * @see #m_IndicesBuffer */ protected void initBuffers() { m_ValueBuffer = new double[m_Data.numAttributes()]; m_IndicesBuffer = new int[m_Data.numAttributes()]; } /** * compactifies the data */ protected void compactify() { if (m_Data != null) m_Data.compactify(); } /** * Throws error message with line number and last token read. * * @param msg the error message to be thrown * @throws IOException containing the error message */ protected void errorMessage(String msg) throws IOException { String str = msg + ", read " + m_Tokenizer.toString(); if (m_Lines > 0) { int line = Integer.parseInt(str.replaceAll(".* line ", "")); str = str.replaceAll(" line .*", " line " + (m_Lines + line - 1)); } throw new IOException(str); } /** * returns the current line number * * @return the current line number */ public int getLineNo() { return m_Lines + m_Tokenizer.lineno(); } /** * Gets next token, skipping empty lines. * * @throws IOException if reading the next token fails */ protected void getFirstToken() throws IOException { while (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}; if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) { m_Tokenizer.ttype = StreamTokenizer.TT_WORD; } else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) && (m_Tokenizer.sval.equals("?"))){ m_Tokenizer.ttype = '?'; } } /** * Gets index, checking for a premature and of line. 
* * @throws IOException if it finds a premature end of line */ protected void getIndex() throws IOException { if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) { errorMessage("premature end of line"); } if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) { errorMessage("premature end of file"); } } /** * Gets token and checks if its end of line. * * @param endOfFileOk whether EOF is OK * @throws IOException if it doesn't find an end of line */ protected void getLastToken(boolean endOfFileOk) throws IOException { if ((m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) && ((m_Tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) { errorMessage("end of line expected"); } } /** * Gets next token, checking for a premature and of line. * * @throws IOException if it finds a premature end of line */ protected void getNextToken() throws IOException { if (m_Tokenizer.nextToken() == StreamTokenizer.TT_EOL) { errorMessage("premature end of line"); } if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) { errorMessage("premature end of file"); } else if ((m_Tokenizer.ttype == '\'') || (m_Tokenizer.ttype == '"')) { m_Tokenizer.ttype = StreamTokenizer.TT_WORD; } else if ((m_Tokenizer.ttype == StreamTokenizer.TT_WORD) && (m_Tokenizer.sval.equals("?"))){ m_Tokenizer.ttype = '?'; } } /** * Initializes the StreamTokenizer used for reading the ARFF file. */ protected void initTokenizer(){ m_Tokenizer.resetSyntax(); m_Tokenizer.whitespaceChars(0, ' '); m_Tokenizer.wordChars(' '+1,'\u00FF'); m_Tokenizer.whitespaceChars(',',','); m_Tokenizer.commentChar('%'); m_Tokenizer.quoteChar('"'); m_Tokenizer.quoteChar('\''); m_Tokenizer.ordinaryChar('{'); m_Tokenizer.ordinaryChar('}'); m_Tokenizer.eolIsSignificant(true); } /** * Reads a single instance using the tokenizer and returns it. 
     *
     * @param structure the dataset header information, will get updated
     *          in case of string or relational attributes
     * @return the instance read, or null if end of file has been reached
     * @throws IOException if the information is not read
     *           successfully
     */
    public Instance readInstance(Instances structure) throws IOException {
      return readInstance(structure, true);
    }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -