📄 filedatarowreader.java
字号:
/*
* YALE - Yet Another Learning Environment
* Copyright (C) 2001-2004
* Simon Fischer, Ralf Klinkenberg, Ingo Mierswa,
* Katharina Morik, Oliver Ritthoff
* Artificial Intelligence Unit
* Computer Science Department
* University of Dortmund
* 44221 Dortmund, Germany
* email: yale-team@lists.sourceforge.net
* web: http://yale.cs.uni-dortmund.de/
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA.
*/
package edu.udo.cs.yale.example;
import edu.udo.cs.yale.tools.LogService;
import edu.udo.cs.yale.tools.Ontology;
import edu.udo.cs.yale.tools.att.AttributeDataSource;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.LinkedList;
import java.util.Iterator;
import java.io.File;
import java.io.FileReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StreamTokenizer;
/**
 * FileDataRowReader implements a DataRowReader that reads DataRows from one or more files.
 * This is the main data reader for many file formats (including csv) and is used by the
 * ExampleSource operator.
 *
 * <p>Every attribute is bound to a (file, column) pair. One {@link StreamTokenizer} is created
 * per distinct file and all tokenizers are advanced in lock step: one line of every file makes
 * up one data row. Lines that yield no token at all (e.g. blank lines) are skipped.
 *
 * @author Simon, Ingo
 * @version $Id: FileDataRowReader.java,v 2.9 2004/09/10 21:52:47 ingomierswa Exp $
 */
public class FileDataRowReader extends AbstractDataRowReader {

    /** Position of the tokenizer (file) index inside an entry of {@link #dataSourceIndex}. */
    private static final int FILE_NR = 0;

    /** Position of the token (column) index inside an entry of {@link #dataSourceIndex}. */
    private static final int TOKEN_NR = 1;

    /** One tokenizer per distinct data file, in order of first appearance. */
    private StreamTokenizer[] attributeDataTokenizer;

    /** The attributes to read, in the order of the attribute data sources. */
    private Attribute[] attributes;

    /** Remember if an end of file (or an unrecoverable read error) has occurred. */
    private boolean eof;

    /** Remember if a line has already been read but not yet been delivered by {@link #next()}. */
    private boolean lineRead;

    /** The maximum number of data rows to deliver; -1 means no limit. */
    private int maxNumber;

    /** The number of data rows delivered so far. */
    private int linesRead = 0;

    /** Tokens of the current line; currentData[f][c] is column c of the file with index f. */
    private String[][] currentData;

    /** Array of size [number of attributes][2]. For each attribute i the value of
     *  dataSourceIndex[i][FILE_NR] is used as an index to {@link #attributeDataTokenizer} and
     *  the value of dataSourceIndex[i][TOKEN_NR] specifies the index of the token to use for
     *  attribute i. */
    private int[][] dataSourceIndex;

    /**
     * Constructs a new FileDataRowReader.
     *
     * @param factory Factory used to create data rows.
     * @param attributeDataSources List of {@link AttributeDataSource}s.
     * @param sampleSize Limit sample to the first sampleSize lines read from files. -1 for no limit.
     * @param separators Characters treated as column separators (see {@link #makeTokenizer}).
     * @param commentChars Characters which start a comment (see {@link #makeTokenizer}).
     * @param ignoreChars Characters which are skipped like whitespace (see {@link #makeTokenizer}).
     * @throws IOException if one of the data files cannot be opened.
     */
    public FileDataRowReader(DataRowFactory factory,
                             List attributeDataSources,
                             int sampleSize,
                             char[] separators,
                             char[] commentChars,
                             char[] ignoreChars) throws IOException {
        super(factory);
        this.maxNumber = sampleSize;
        this.attributes = new Attribute[attributeDataSources.size()];
        this.dataSourceIndex = new int[attributeDataSources.size()][2];

        List tokenizerList = new LinkedList();
        // map all files used to indices, so that each file is opened only once
        Map fileMap = new HashMap();
        int greatestFileIndex = -1;
        int attribute = 0;
        for (Iterator i = attributeDataSources.iterator(); i.hasNext(); attribute++) {
            AttributeDataSource ads = (AttributeDataSource) i.next();
            attributes[attribute] = ads.getAttribute();
            File file = ads.getFile();
            Integer fileIndex = (Integer) fileMap.get(file);
            // new file found? -> create tokenizer and map to index number
            // NOTE(review): a null file yields a null tokenizer (see makeTokenizer) which
            // readLine() would dereference -- confirm that callers always supply a file.
            if (fileIndex == null) {
                fileIndex = new Integer(++greatestFileIndex);
                fileMap.put(file, fileIndex);
                tokenizerList.add(makeTokenizer(file, separators, commentChars, ignoreChars));
            }
            dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue();
            dataSourceIndex[attribute][TOKEN_NR] = ads.getColumn();
        }

        this.attributeDataTokenizer = new StreamTokenizer[tokenizerList.size()];
        tokenizerList.toArray(this.attributeDataTokenizer);

        // determine maximal token index used per file
        int[] maxTokenIndex = new int[this.attributeDataTokenizer.length];
        for (attribute = 0; attribute < dataSourceIndex.length; attribute++) {
            int fileNr = dataSourceIndex[attribute][FILE_NR];
            int tokenNr = dataSourceIndex[attribute][TOKEN_NR];
            if (tokenNr > maxTokenIndex[fileNr]) {
                maxTokenIndex[fileNr] = tokenNr;
            }
        }

        // create temporary string arrays to store the tokens of one line in
        this.currentData = new String[this.attributeDataTokenizer.length][];
        for (int t = 0; t < maxTokenIndex.length; t++) {
            this.currentData[t] = new String[maxTokenIndex[t] + 1];
        }
    }

    /**
     * Delivers a <tt>StreamTokenizer</tt> with the default syntax and additionally some syntax
     * enhancements done by the user.
     *
     * <p>Letters, digits, characters 128-255 and <code>+ - _ . ? / \</code> form words, characters
     * up to and including the blank are whitespace, single and double quotes delimit quoted
     * strings and the end of a line is reported as a token of its own.
     *
     * @param file The file to read; if null, no tokenizer is created.
     * @param separators Additional characters treated as whitespace; null means none.
     * @param commentChars Characters starting a comment; null means none.
     * @param ignoreChars Characters skipped like whitespace; null means none.
     * @return the configured tokenizer, or null if file is null.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     */
    public static StreamTokenizer makeTokenizer(File file,
                                                char[] separators,
                                                char[] commentChars,
                                                char[] ignoreChars) throws FileNotFoundException {
        if (file == null) {
            return null;
        }
        StreamTokenizer tokenizer = new StreamTokenizer(new FileReader(file));

        // start from an empty syntax table: every character is ordinary afterwards
        tokenizer.resetSyntax();

        // token characters
        tokenizer.wordChars(128, 255);   // characters above the ASCII range
        tokenizer.wordChars('a', 'z');
        tokenizer.wordChars('A', 'Z');
        tokenizer.wordChars('0', '9');
        tokenizer.wordChars('+', '+');
        tokenizer.wordChars('-', '-');
        tokenizer.wordChars('_', '_');
        tokenizer.wordChars('.', '.');
        tokenizer.wordChars('?', '?');
        tokenizer.wordChars('/', '/');
        tokenizer.wordChars('\\', '\\'); // backslash

        // default whitespace characters: control characters and the blank
        tokenizer.whitespaceChars('\u0000', '\u0020');

        // additional whitespace characters (column separators)
        if (separators != null) {
            for (int i = 0; i < separators.length; i++) {
                tokenizer.ordinaryChar(separators[i]);
                tokenizer.whitespaceChars(separators[i], separators[i]);
            }
        }

        // ignore characters are skipped exactly like whitespace
        if (ignoreChars != null) {
            for (int i = 0; i < ignoreChars.length; i++) {
                tokenizer.ordinaryChar(ignoreChars[i]);
                tokenizer.whitespaceChars(ignoreChars[i], ignoreChars[i]);
            }
        }

        // quote characters (set after the separators, hence they take precedence)
        tokenizer.quoteChar('"');
        tokenizer.quoteChar('\'');

        // comment characters
        if (commentChars != null) {
            for (int i = 0; i < commentChars.length; i++) {
                tokenizer.ordinaryChar(commentChars[i]);
                tokenizer.commentChar(commentChars[i]);
            }
        }

        // end of line is significant and can be asked by TT_EOL
        tokenizer.eolIsSignificant(true);
        return tokenizer;
    }

    /**
     * Returns true if another data row can be delivered by {@link #next()}. If necessary the
     * next line is read ahead from all files. An IOException during reading is logged and
     * ends the iteration.
     */
    public boolean hasNext() {
        if ((maxNumber > -1) && (linesRead >= maxNumber)) {
            return false;
        }
        if (lineRead) {
            return !eof;
        }
        try {
            eof = !readLine();
        } catch (IOException e) {
            LogService.logException("FileDataRowReader.hasNext(): " + e.getMessage(), e);
            // Mark the reader as exhausted. Otherwise a later call would silently resume
            // in the middle of the files and could answer true after having answered false.
            eof = true;
            lineRead = true;
            return false;
        }
        lineRead = true;
        return !eof;
    }

    /**
     * Reads a line of data from all tokenizers into {@link #currentData}.
     *
     * @return true if the line was readable, false if one of the files has reached its end.
     * @throws IOException if a line has fewer tokens than the greatest column index used for
     *         its file, or if reading fails.
     */
    private boolean readLine() throws IOException {
        for (int i = 0; i < attributeDataTokenizer.length; i++) {
            StreamTokenizer tokenizer = attributeDataTokenizer[i];
            String[] tokens = currentData[i];
            int column = 0;
            boolean eol = false;
            while (!eol && (column < tokens.length)) {
                tokenizer.nextToken();
                if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                    return false;
                }
                if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
                    // an end of line before the first token is an empty line and is skipped
                    if (column != 0) {
                        eol = true;
                    }
                } else {
                    // sval holds the text of word and quoted tokens
                    tokens[column++] = tokenizer.sval;
                }
            }
            if (!eol) {
                // all needed columns are read: discard the rest of the line
                while (tokenizer.ttype != StreamTokenizer.TT_EOL) {
                    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                        break; // necessary if the data file does not end with a newline
                    }
                    tokenizer.nextToken();
                }
            }
            // specified a column which does not exist!
            if (column != tokens.length) {
                throw new IOException("Column does not exist! Please check your attribute description file!");
            }
        }
        return true;
    }

    /**
     * Returns the next data row, or null if there is none. Attributes whose column index is -1
     * receive null as their value.
     */
    public DataRow next() {
        if (eof) {
            return null;
        }
        if (!lineRead && !hasNext()) {
            return null;
        }
        String[] data = new String[attributes.length];
        for (int i = 0; i < attributes.length; i++) {
            int tokenNr = dataSourceIndex[i][TOKEN_NR];
            if (tokenNr == -1) {
                data[i] = null;
            } else {
                data[i] = currentData[dataSourceIndex[i][FILE_NR]][tokenNr];
            }
        }
        DataRow dataRow = getFactory().create(data, attributes);
        linesRead++;
        lineRead = false;
        return dataRow;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -