📄 filedatarowreader.java
字号:
/*
 *  YALE - Yet Another Learning Environment
 *  Copyright (C) 2002, 2003
 *  Simon Fischer, Ralf Klinkenberg, Ingo Mierswa,
 *  Katharina Morik, Oliver Ritthoff
 *  Artificial Intelligence Unit
 *  Computer Science Department
 *  University of Dortmund
 *  44221 Dortmund, Germany
 *  email: yale@ls8.cs.uni-dortmund.de
 *  web: http://yale.cs.uni-dortmund.de/
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 *  USA.
 */
package edu.udo.cs.yale.example;

import edu.udo.cs.yale.tools.LogService;
import edu.udo.cs.yale.tools.Ontology;
import edu.udo.cs.yale.tools.att.AttributeDataSource;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * FileDataRowReader implements a DataRowReader that reads DataRows from one or
 * more files. Each attribute is mapped to a (file, column) pair; one
 * {@link StreamTokenizer} per distinct file tokenizes the input line by line,
 * and {@link #next()} assembles a data row from the buffered tokens.
 *
 * @author Simon, Ingo
 * @version $Id: FileDataRowReader.java,v 2.4 2003/07/30 13:16:45 fischer Exp $
 */
public class FileDataRowReader extends AbstractDataRowReader {

    /** Index into {@link #dataSourceIndex}[i]: number of the source file. */
    private static final int FILE_NR = 0;

    /** Index into {@link #dataSourceIndex}[i]: number of the token (column) within a line. */
    private static final int TOKEN_NR = 1;

    /** One tokenizer per distinct input file. */
    private StreamTokenizer[] attributeDataTokenizer;

    /** The attributes to read, in the order they appear in the resulting data rows. */
    private Attribute[] attributes;

    /** Remember if an end of file has occurred. */
    private boolean eof;

    /** Remember if a line has already been read (buffered by {@link #hasNext()} for {@link #next()}). */
    private boolean lineRead;

    /** The maximum number of lines to read, or -1 for no limit. */
    private int maxNumber;

    /** The number of lines read so far. */
    private int linesRead = 0;

    /** Token buffer: currentData[f][t] holds token t of the current line of file f. */
    private String[][] currentData;

    /**
     * Array of size [number of attributes][2]. For each attribute i the value of
     * dataSourceIndex[i][FILE_NR] is used as an index to {@link #attributeDataTokenizer} and
     * the value of dataSourceIndex[i][TOKEN_NR] specifies the index of the token to use for
     * attribute i.
     */
    private int[][] dataSourceIndex;

    /**
     * Constructs a new FileDataRowReader.
     *
     * @param factory              factory used to create data rows
     * @param attributeDataSources list of {@link AttributeDataSource}s, one per attribute
     * @param sampleSize           limit sample to the first sampleSize lines read from files,
     *                             -1 for no limit
     * @param separators           characters treated as column separators
     * @param commentChars         characters starting a comment that extends to end of line
     * @param ignoreChars          characters skipped entirely
     * @throws IOException if one of the source files cannot be opened
     */
    public FileDataRowReader(DataRowFactory factory,
                             List attributeDataSources,
                             int sampleSize,
                             char[] separators,
                             char[] commentChars,
                             char[] ignoreChars) throws IOException {
        super(factory);
        this.maxNumber = sampleSize;
        this.attributes = new Attribute[attributeDataSources.size()];
        this.dataSourceIndex = new int[attributeDataSources.size()][2];
        List tokenizerList = new LinkedList();

        // map all files used to indices; each distinct file gets one tokenizer
        Map fileMap = new HashMap();
        Iterator i = attributeDataSources.iterator();
        int attribute = 0;
        int greatestFileIndex = -1;
        while (i.hasNext()) {
            AttributeDataSource ads = (AttributeDataSource) i.next();
            attributes[attribute] = ads.getAttribute();
            File file = ads.getFile();
            Integer fileIndex = (Integer) fileMap.get(file);
            // new file found? -> create tokenizer and map to index number
            if (fileIndex == null) {
                fileIndex = new Integer(++greatestFileIndex);
                fileMap.put(file, fileIndex);
                tokenizerList.add(makeTokenizer(file, separators, commentChars, ignoreChars));
            }
            dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue();
            dataSourceIndex[attribute][TOKEN_NR] = ads.getColumn();
            attribute++;
        }

        // determine the maximal token index used for each file
        this.attributeDataTokenizer = new StreamTokenizer[tokenizerList.size()];
        tokenizerList.toArray(this.attributeDataTokenizer);
        int[] maxTokenIndex = new int[this.attributeDataTokenizer.length];
        for (attribute = 0; attribute < dataSourceIndex.length; attribute++) {
            if (dataSourceIndex[attribute][TOKEN_NR] > maxTokenIndex[dataSourceIndex[attribute][FILE_NR]]) {
                maxTokenIndex[dataSourceIndex[attribute][FILE_NR]] = dataSourceIndex[attribute][TOKEN_NR];
            }
        }

        // create temporary string arrays to store the tokens of the current line in
        currentData = new String[this.attributeDataTokenizer.length][];
        for (int t = 0; t < maxTokenIndex.length; t++) {
            currentData[t] = new String[maxTokenIndex[t] + 1];
        }
    }

    /**
     * Delivers a <tt>StreamTokenizer</tt> with the default syntax and additionally some
     * syntax enhancements done by the user. End-of-line is significant
     * ({@link StreamTokenizer#TT_EOL}) so that {@link #readLine()} can detect line breaks.
     *
     * @param file         the file to read, may be null (then null is returned)
     * @param separators   additional whitespace (separator) characters
     * @param commentChars characters starting an end-of-line comment
     * @param ignoreChars  characters treated as whitespace and skipped
     * @return a configured tokenizer, or null if file is null
     * @throws FileNotFoundException if the file does not exist
     */
    public static StreamTokenizer makeTokenizer(File file,
                                                char[] separators,
                                                char[] commentChars,
                                                char[] ignoreChars) throws FileNotFoundException {
        StreamTokenizer tokenizer = null;
        if (file != null) {
            // NOTE(review): FileReader uses the platform default charset — presumably
            // intentional for this code base, but verify for non-ASCII data files.
            tokenizer = new StreamTokenizer(new FileReader(file));
            // resets the syntax of the tokenizer so only the rules below apply
            tokenizer.resetSyntax();
            // token characters
            tokenizer.wordChars(128, 255); // non-ASCII bytes
            tokenizer.wordChars('a', 'z'); // a-z
            tokenizer.wordChars('A', 'Z'); // A-Z
            tokenizer.wordChars('0', '9'); // 0-9
            tokenizer.wordChars('+', '+'); // +
            tokenizer.wordChars('-', '-'); // -
            tokenizer.wordChars('_', '_'); // _
            tokenizer.wordChars('.', '.'); // .
            tokenizer.wordChars('?', '?'); // ?
            // default whitespace characters
            tokenizer.whitespaceChars('\u0000', '\u0020');
            // additional whitespace characters (default is comma)
            for (int i = 0; i < separators.length; i++) {
                tokenizer.ordinaryChar(separators[i]);
                tokenizer.whitespaceChars(separators[i], separators[i]);
            }
            // ignore characters (none by default)
            for (int i = 0; i < ignoreChars.length; i++) {
                tokenizer.ordinaryChar(ignoreChars[i]);
                tokenizer.whitespaceChars(ignoreChars[i], ignoreChars[i]);
            }
            // quote characters
            tokenizer.quoteChar('"');
            tokenizer.quoteChar('\'');
            // comment characters
            for (int i = 0; i < commentChars.length; i++) {
                tokenizer.ordinaryChar(commentChars[i]);
                tokenizer.commentChar(commentChars[i]);
            }
            tokenizer.eolIsSignificant(true); // end of line is significant and can be asked by TT_EOL
        }
        return tokenizer;
    }

    /**
     * Returns true if another data row can be read. Reads (and buffers) the next line
     * from all files as a side effect; {@link #next()} consumes the buffered line.
     */
    public boolean hasNext() {
        if ((maxNumber > -1) && (linesRead >= maxNumber))
            return false;
        if (lineRead)
            return !eof; // a line is already buffered
        try {
            eof = !readLine();
        } catch (IOException e) {
            LogService.logException("FileDataRowReader.hasNext():", e);
            return false;
        }
        lineRead = true;
        return (!eof);
    }

    /**
     * Reads a line of data from all tokenizers into {@link #currentData}.
     * Empty lines are skipped; excess tokens on a line are discarded.
     *
     * @return false if end of file was reached on any of the files
     */
    private boolean readLine() throws IOException {
        for (int i = 0; i < attributeDataTokenizer.length; i++) {
            int column = 0;
            boolean eol = false;
            while (!eol && (column < this.currentData[i].length)) {
                attributeDataTokenizer[i].nextToken();
                if (attributeDataTokenizer[i].ttype == StreamTokenizer.TT_EOF)
                    return false;
                if (attributeDataTokenizer[i].ttype == StreamTokenizer.TT_EOL) {
                    if (column != 0) {
                        eol = true;
                    }
                    // column == 0: blank line, keep reading
                } else {
                    this.currentData[i][column++] = attributeDataTokenizer[i].sval;
                }
            }
            if (!eol) {
                // skip the remaining tokens of this line; also stop at TT_EOF, otherwise
                // a last line without a trailing newline would loop forever
                while ((attributeDataTokenizer[i].ttype != StreamTokenizer.TT_EOL)
                        && (attributeDataTokenizer[i].ttype != StreamTokenizer.TT_EOF)) {
                    attributeDataTokenizer[i].nextToken();
                }
            }
        }
        return true;
    }

    /**
     * Returns the next data row, or null if no more rows are available.
     * Triggers {@link #hasNext()} if no line is buffered yet.
     */
    public DataRow next() {
        if (eof)
            return null;
        if (!lineRead)
            if (!hasNext())
                return null;
        String[] data = new String[attributes.length];
        for (int i = 0; i < attributes.length; i++) {
            if (dataSourceIndex[i][TOKEN_NR] == -1) {
                data[i] = null; // attribute has no source column
            } else {
                data[i] = currentData[dataSourceIndex[i][FILE_NR]][dataSourceIndex[i][TOKEN_NR]];
            }
        }
        DataRow dataRow = getFactory().create(data, attributes);
        linesRead++;
        lineRead = false; // buffer consumed
        return dataRow;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -