📄 sparseformatdatarowreader.java
字号:
/* * YALE - Yet Another Learning Environment * Copyright (C) 2002, 2003 * Simon Fischer, Ralf Klinkenberg, Ingo Mierswa, * Katharina Morik, Oliver Ritthoff * Artificial Intelligence Unit * Computer Science Department * University of Dortmund * 44221 Dortmund, Germany * email: yale@ls8.cs.uni-dortmund.de * web: http://yale.cs.uni-dortmund.de/ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. */package edu.udo.cs.yale.example;import edu.udo.cs.yale.tools.LogService;import edu.udo.cs.yale.tools.Ontology;import edu.udo.cs.yale.tools.att.AttributeSet;import java.util.List;import java.util.LinkedList;import java.util.Iterator;import java.io.IOException;import java.io.Reader;import java.io.BufferedReader;import java.util.StringTokenizer;import java.util.Map;import java.util.HashMap;import java.util.Vector;/** Reads the data rows in sparse format. The format is specified in the class comment of * {@link edu.udo.cs.yale.operator.SparseFormatExampleSource}. {@link Attribute}s may be passed to the reader * in its constructor. If they are ommitted, they are generated on the fly. * In either case, indices are assigned to the attributes. If an {@link ExampleTable} * is generated using instances of this class, the constructor of {@link ExampleTable} * will reassign these indexes. * * @author Simon, Ingo * @version $Id: SparseFormatDataRowReader.java,v 2.13 2003/09/10 13:02:06 fischer Exp $ */public class SparseFormatDataRowReader extends AbstractDataRowReader { /** Names of the formats. */ public static final String[] FORMAT_NAMES = { "xy", "yx", "prefix", "separate_file", "no_label" }; /** Label succeeds attributes. */ public static final int FORMAT_XY = 0; /** Label preceeds attributes. */ public static final int FORMAT_YX = 1; /** Label has a prefix specified in the prefix map. */ public static final int FORMAT_PREFIX = 2; /** Label is in separate file. */ public static final int FORMAT_SEPARATE_FILE = 3; /** Label is missing. */ public static final int FORMAT_NO_LABEL = 4; /** Reader for the labels. */ private BufferedReader inAttributes, inLabels; /** The attribute set with regular and special attributes. */ private AttributeSet attributeSet = null; /** Remember if an end of file has occured. */ private boolean eof; /** Remember if a line has already been read. */ private boolean lineRead; /** The maximum number of attributes to read. */ private int maxNumber; /** Number of lines already read. */ private int linesRead; /** The DataRow that will be returned in the next call to {@link #next()} */ private DataRow currentDataRow; /** One out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, FORMAT_SEPARATE_FILE, and FORMAT_NO_LABEL. */ private int format; /** The dimension of the examples, i.e. the total number of regular and special attributes. */ private int dimension; /** Maps prefixes to special attribute names, e.g. "l:" to "label". */ private Map prefixMap = new HashMap(); /** Creates a new data row reader for sparse format. The attributes indices * must not be set. If they are, they are reassigned new values when this constructor * is called! * * @param factory Factory used to create {@link DataRow} instances. * @param format One Out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, and FORMAT_SEPARATE_FILE. * @param prefixMap Maps prefixes to special attribute names (e.g. "l" to "label"). * @param attributeSet Set of regular and special attributes. * @param attributeReader Reader for the data * @param labelReader Reader for the labels. Only necessary if format is FORMAT_SEPARATE_FILE. * @param sampleSize sample size, may be -1 for no limit. **/ public SparseFormatDataRowReader(DataRowFactory factory, int format, Map prefixMap, AttributeSet attributeSet, Reader attributeReader, Reader labelReader, int sampleSize) { super(factory); this.format = format; this.prefixMap = prefixMap; this.attributeSet = attributeSet; if (attributeSet == null) { throw new IllegalArgumentException("AttributeSet must not be null."); } this.dimension = attributeSet.getAllAttributes().size(); this.maxNumber = sampleSize; this.inAttributes = new BufferedReader(attributeReader); if (format == FORMAT_SEPARATE_FILE) { if (labelReader == null) throw new IllegalArgumentException("labelReader must not be null if format is 'separate_file'!"); this.inLabels = new BufferedReader(labelReader); } if (format != FORMAT_NO_LABEL) { if (attributeSet.getSpecialAttribute("label") == null) { throw new IllegalArgumentException("If format is not no_label, label attribute must be defined."); } } } /** Checks if futher examples exist. Returns false if one of the files end. */ public boolean hasNext(){ if ((maxNumber > -1) && (linesRead >= maxNumber)) return false; if (lineRead) return !eof; try { eof = !readLine(); if (eof) { inAttributes.close(); if (inLabels != null) inLabels.close(); } } catch(IOException e) { throw new RuntimeException(e.getMessage(), e); } lineRead = true; return (!eof); } private boolean readLine() throws IOException { String attributeLine = null; do { attributeLine = inAttributes.readLine(); if (attributeLine == null) return false; } while (attributeLine.startsWith("#") || (attributeLine.length() == 0)); this.currentDataRow = getFactory().create(dimension); StringTokenizer tokenizer = new StringTokenizer(attributeLine); String labelString = null; if (format == FORMAT_YX) { labelString = tokenizer.nextToken(); } else if (format == FORMAT_SEPARATE_FILE) { do { labelString = inLabels.readLine(); if (labelString == null) return false; } while (labelString.startsWith("#") || (labelString.length() == 0)); } while (tokenizer.hasMoreTokens()) { String attributeToken = tokenizer.nextToken(); int colonIndex = attributeToken.indexOf(':'); if ((format == FORMAT_XY) && (colonIndex == -1)) { if (labelString != null) { throw new IOException("Malformed line in examplefile: " + attributeToken); } else { labelString = attributeToken; } } else { String pos = attributeToken.substring(0, colonIndex);// references the attribute String value = attributeToken.substring(colonIndex+1); // the attribute value Attribute attribute = null; // the referenced attribute try { int index = Integer.parseInt(pos)-1; if ((index < 0) || (index >= attributeSet.getNumberOfRegularAttributes())) throw new IOException("Attribute index out of range: '"+(index + 1)+ "'! Index must be between 1 and dimension " + attributeSet.getNumberOfRegularAttributes()+"!"); attribute = attributeSet.getAttribute(index); } catch (NumberFormatException e) { String specialAttributeName = (String)prefixMap.get(pos); if (specialAttributeName == null) { attribute = attributeSet.getSpecialAttribute(pos); if (attribute == null) throw new IOException("Illegal attribute index: '"+pos+ "' (legal values are integers and defined prefixes for special attributes (Parameter prefix_map of SparseFormatExampleSource))!"); } else { attribute = attributeSet.getSpecialAttribute(specialAttributeName); } if (attribute == null) throw new IOException("Unknown special attribute: " + specialAttributeName); } if (attribute != null) { if (attribute.isNominal()) { currentDataRow.set(attribute, attribute.mapString(value)); } else { try { currentDataRow.set(attribute, Double.parseDouble(value)); } catch (NumberFormatException e) { throw new IOException("Attribute is not numerical: '"+value+"'!"); } } } } } if (labelString != null) { Attribute label = attributeSet.getSpecialAttribute("label"); if (label.isNominal()) { currentDataRow.set(label, label.mapString(labelString)); } else { try { currentDataRow.set(label, Double.parseDouble(labelString)); } catch (NumberFormatException e) { throw new IOException("Label is not numerical: '"+labelString+"'."); } } } return true; } /** Returns the next Example. */ public DataRow next(){ if (eof == true) return null; if (!lineRead) if (!hasNext()) return null; linesRead++; lineRead = false; return currentDataRow; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -