📄 sparseformatdatarowreader.java
字号:
/*
* YALE - Yet Another Learning Environment
* Copyright (C) 2001-2004
* Simon Fischer, Ralf Klinkenberg, Ingo Mierswa,
* Katharina Morik, Oliver Ritthoff
* Artificial Intelligence Unit
* Computer Science Department
* University of Dortmund
* 44221 Dortmund, Germany
* email: yale-team@lists.sourceforge.net
* web: http://yale.cs.uni-dortmund.de/
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA.
*/
package edu.udo.cs.yale.example;
import edu.udo.cs.yale.tools.LogService;
import edu.udo.cs.yale.tools.Ontology;
import edu.udo.cs.yale.tools.att.AttributeSet;
import java.util.List;
import java.util.LinkedList;
import java.util.Iterator;
import java.io.IOException;
import java.io.Reader;
import java.io.BufferedReader;
import java.util.StringTokenizer;
import java.util.Map;
import java.util.HashMap;
import java.util.Vector;
/** Reads the data rows in sparse format. The format is specified in the class comment of
* {@link edu.udo.cs.yale.operator.io.SparseFormatExampleSource}. {@link Attribute}s may be passed to the reader
* in its constructor. If they are omitted, they are generated on the fly.
* In either case, indices are assigned to the attributes. If an {@link ExampleTable}
* is generated using instances of this class, the constructor of {@link ExampleTable}
* will reassign these indices.
*
* @author Simon, Ingo
* @version $Id: SparseFormatDataRowReader.java,v 2.16 2004/08/27 11:57:32 ingomierswa Exp $
*/
public class SparseFormatDataRowReader extends AbstractDataRowReader {

    /** Names of the formats. The position of each name equals the value of the matching FORMAT_* constant. */
    public static final String[] FORMAT_NAMES = { "xy", "yx", "prefix", "separate_file", "no_label" };

    /** Label succeeds attributes. */
    public static final int FORMAT_XY = 0;

    /** Label precedes attributes. */
    public static final int FORMAT_YX = 1;

    /** Label has a prefix specified in the prefix map. */
    public static final int FORMAT_PREFIX = 2;

    /** Label is in separate file. */
    public static final int FORMAT_SEPARATE_FILE = 3;

    /** Label is missing. */
    public static final int FORMAT_NO_LABEL = 4;

    /** Reader for the attribute lines. */
    private BufferedReader inAttributes;

    /** Reader for the labels. Only set if the format is FORMAT_SEPARATE_FILE, otherwise null. */
    private BufferedReader inLabels;

    /** The attribute set with regular and special attributes. */
    private AttributeSet attributeSet = null;

    /** Remember if an end of file has occurred. */
    private boolean eof;

    /** Remember if a line has already been read (and not yet been consumed by {@link #next()}). */
    private boolean lineRead;

    /** The maximum number of data rows to deliver; values below 0 mean no limit. */
    private int maxNumber;

    /** Number of data rows already delivered by {@link #next()}. */
    private int linesRead;

    /** The DataRow that will be returned in the next call to {@link #next()} */
    private DataRow currentDataRow;

    /** One out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, FORMAT_SEPARATE_FILE, and FORMAT_NO_LABEL. */
    private int format;

    /** The dimension of the examples, i.e. the total number of regular and special attributes. */
    private int dimension;

    /** Maps prefixes to special attribute names, e.g. "l:" to "label". Never null. */
    private Map prefixMap = new HashMap();

    /** Creates a new data row reader for sparse format. The attributes indices
     *  must not be set. If they are, they are reassigned new values when this constructor
     *  is called!
     *
     *  @param factory Factory used to create {@link DataRow} instances.
     *  @param format One out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, FORMAT_SEPARATE_FILE, and FORMAT_NO_LABEL.
     *  @param prefixMap Maps prefixes to special attribute names (e.g. "l" to "label").
     *                   May be null, which is treated like an empty map.
     *  @param attributeSet Set of regular and special attributes. Must not be null.
     *  @param attributeReader Reader for the data
     *  @param labelReader Reader for the labels. Only necessary if format is FORMAT_SEPARATE_FILE.
     *  @param sampleSize sample size, may be -1 for no limit.
     *  @throws IllegalArgumentException if attributeSet is null, if labelReader is null although
     *          format is FORMAT_SEPARATE_FILE, or if format is not FORMAT_NO_LABEL and the attribute
     *          set defines no special attribute "label".
     **/
    public SparseFormatDataRowReader(DataRowFactory factory,
                                     int format,
                                     Map prefixMap,
                                     AttributeSet attributeSet,
                                     Reader attributeReader,
                                     Reader labelReader,
                                     int sampleSize) {
        super(factory);
        this.format = format;
        // Keep the empty default map if the caller passes null, so that prefix
        // lookups while parsing cannot fail with a NullPointerException.
        if (prefixMap != null) {
            this.prefixMap = prefixMap;
        }
        this.attributeSet = attributeSet;
        if (attributeSet == null) {
            throw new IllegalArgumentException("AttributeSet must not be null.");
        }
        this.dimension = attributeSet.getAllAttributes().size();
        this.maxNumber = sampleSize;
        this.inAttributes = new BufferedReader(attributeReader);
        if (format == FORMAT_SEPARATE_FILE) {
            if (labelReader == null) {
                throw new IllegalArgumentException("labelReader must not be null if format is 'separate_file'!");
            }
            this.inLabels = new BufferedReader(labelReader);
        }
        if (format != FORMAT_NO_LABEL) {
            if (attributeSet.getSpecialAttribute("label") == null) {
                throw new IllegalArgumentException("If format is not no_label, label attribute must be defined.");
            }
        }
    }

    /** Checks if further examples exist. Returns false if one of the files ends or if the
     *  sample size given in the constructor has been reached. When the end of input is
     *  detected, the readers are closed.
     *
     *  @throws RuntimeException wrapping any IOException raised while reading or parsing.
     */
    public boolean hasNext() {
        if ((maxNumber > -1) && (linesRead >= maxNumber)) return false;
        // A row was already parsed by a previous call and is still waiting for next().
        if (lineRead) return !eof;
        try {
            eof = !readLine();
            if (eof) {
                closeReaders();
            }
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
        lineRead = true;
        return (!eof);
    }

    /** Closes the attribute reader and, if present, the label reader. The label reader is
     *  closed even if closing the attribute reader fails. */
    private void closeReaders() throws IOException {
        try {
            inAttributes.close();
        } finally {
            if (inLabels != null) {
                inLabels.close();
            }
        }
    }

    /** Reads lines from the given reader until one is found that is neither empty nor
     *  starts with "#".
     *
     *  @return the line, or null if the end of the stream was reached first.
     */
    private static String readContentLine(BufferedReader in) throws IOException {
        String line;
        do {
            line = in.readLine();
            if (line == null) return null;
        } while (line.startsWith("#") || (line.length() == 0));
        return line;
    }

    /** Parses the next example into {@link #currentDataRow}.
     *
     *  @return true if a row was parsed, false if the attribute input (or, for
     *          FORMAT_SEPARATE_FILE, the label input) is exhausted.
     *  @throws IOException if reading fails or the line is malformed.
     */
    private boolean readLine() throws IOException {
        String attributeLine = readContentLine(inAttributes);
        if (attributeLine == null) return false;

        this.currentDataRow = getFactory().create(dimension);
        StringTokenizer tokenizer = new StringTokenizer(attributeLine);
        String labelString = null;
        if (format == FORMAT_YX) {
            // A line consisting of whitespace only is not filtered out above and has no tokens.
            if (!tokenizer.hasMoreTokens()) {
                throw new IOException("Malformed line in examplefile: label is missing.");
            }
            labelString = tokenizer.nextToken();
        } else if (format == FORMAT_SEPARATE_FILE) {
            labelString = readContentLine(inLabels);
            if (labelString == null) return false;
        }

        while (tokenizer.hasMoreTokens()) {
            String attributeToken = tokenizer.nextToken();
            int colonIndex = attributeToken.indexOf(':');
            if (colonIndex == -1) {
                // A token without ':' is only legal as the (single) label in xy format.
                // In every other case report it as malformed input instead of failing
                // with a StringIndexOutOfBoundsException in substring() below.
                if ((format != FORMAT_XY) || (labelString != null)) {
                    throw new IOException("Malformed line in examplefile: " + attributeToken);
                }
                labelString = attributeToken;
                continue;
            }
            String pos = attributeToken.substring(0, colonIndex);    // references the attribute
            String value = attributeToken.substring(colonIndex + 1); // the attribute value
            Attribute attribute = resolveAttribute(pos);
            if (attribute != null) {
                setValue(attribute, value, false);
            }
        }

        if (labelString != null) {
            setValue(attributeSet.getSpecialAttribute("label"), labelString, true);
        }
        return true;
    }

    /** Resolves the part of a token before the colon to an attribute. Integers are taken as
     *  1-based indices of regular attributes; everything else is looked up as a special attribute.
     *
     *  @throws IOException if the index is out of range or no matching special attribute exists.
     */
    private Attribute resolveAttribute(String pos) throws IOException {
        int index;
        try {
            index = Integer.parseInt(pos) - 1;
        } catch (NumberFormatException e) {
            return resolveSpecialAttribute(pos);
        }
        if ((index < 0) || (index >= attributeSet.getNumberOfRegularAttributes())) {
            throw new IOException("Attribute index out of range: '" + (index + 1) +
                                  "'! Index must be between 1 and dimension " + attributeSet.getNumberOfRegularAttributes() + "!");
        }
        return attributeSet.getAttribute(index);
    }

    /** Resolves a non-numerical reference: first as a prefix from the prefix map, and if no
     *  such prefix is defined, directly as the name of a special attribute.
     *
     *  @throws IOException if no special attribute can be found.
     */
    private Attribute resolveSpecialAttribute(String pos) throws IOException {
        String specialAttributeName = (String) prefixMap.get(pos);
        if (specialAttributeName == null) {
            Attribute direct = attributeSet.getSpecialAttribute(pos);
            if (direct == null) {
                throw new IOException("Illegal attribute index: '" + pos +
                                      "' (legal values are integers and defined prefixes for special attributes (Parameter prefix_map of SparseFormatExampleSource))!");
            }
            return direct;
        }
        Attribute mapped = attributeSet.getSpecialAttribute(specialAttributeName);
        if (mapped == null) {
            throw new IOException("Unknown special attribute: " + specialAttributeName);
        }
        return mapped;
    }

    /** Stores the value in the current data row: nominal attributes via their string mapping,
     *  all others parsed as double.
     *
     *  @param isLabel only selects the wording of the error message.
     *  @throws IOException if the attribute is not nominal and the value is not a number.
     */
    private void setValue(Attribute attribute, String value, boolean isLabel) throws IOException {
        if (attribute.isNominal()) {
            currentDataRow.set(attribute, attribute.mapString(value));
        } else {
            try {
                currentDataRow.set(attribute, Double.parseDouble(value));
            } catch (NumberFormatException e) {
                if (isLabel) {
                    throw new IOException("Label is not numerical: '" + value + "'.");
                }
                throw new IOException("Attribute is not numerical: '" + value + "'!");
            }
        }
    }

    /** Returns the next data row, or null if the end of input or the sample size limit
     *  has been reached. */
    public DataRow next() {
        if (eof == true) return null;
        if (!lineRead) {
            if (!hasNext()) {
                return null;
            }
        }
        linesRead++;
        lineRead = false;
        return currentDataRow;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -