📄 datasetreaderint.java
字号:
/** * JBNC - Bayesian Network Classifiers Toolbox <p> * * Latest release available at http://sourceforge.net/projects/jbnc/ <p> * * Copyright (C) 1999-2003 Jarek Sacha <p> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. <p> * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. <p> * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307, USA. <br> * http://www.fsf.org/licenses/gpl.txt */package jbnc.dataset;import java.io.BufferedReader;import java.io.FileReader;import java.util.TreeSet;import java.util.Vector;/** * Functions for reading data sets with test cases. Variables are assumed to be * 'descrete' or 'ignore'. * * @author Jarek Sacha * @since June 1, 1999 * @see jbnc.dataset.Dataset * @see jbnc.dataset.NamesReader */public final class DatasetReaderInt extends DatasetReader { /** * Reads a data file with cases - comma delimited, no header. This function * is typically used to read files in c4.5 format, description of attributes * needs to be read from the '.names' first using * jbnc.dataset.NamesReader.open(). * * @param names Descriptions of columns in the file. * @param fileName Description of Parameter * @return Vector of vectors representing cases. Each case * attribute is allocated in type defined by 'names' parameters. * @exception Exception Description of Exception */ public Vector open(String fileName, AttributeSpecs[] names) throws Exception { int lineCount = 0; int missingCount = 0; try { // Open file BufferedReader in = new BufferedReader(new FileReader(fileName)); Vector v = new Vector(); String s; // Get cases while ((s = in.readLine()) != null) { ++lineCount; // Read line ParseLine p = new ParseLine(s); Vector raw = p.read(); if (raw == null) { continue; } // Verify and convert int[] l = convertCaseInt(raw, names); if (l != null) { v.add(l); } else { ++missingCount; } } lineCount = -1; in.close(); if (missingCount > 0) { System.out.println("Discarded " + missingCount + " cases with missing values."); } return v; } catch (Exception e) { if (lineCount > 0) { System.out.println("Error in line " + lineCount); System.out.println(e.toString()); } throw e; } } /** * Reads a data file with cases - comma delimited, with header. First line in * the file gives names of attributes (columns). In current implementation * the attributes are assumed to be discrete. * * @param fileName Name of the file to read data from. * @param className Name of the attribute representing class. If it is * null, it is assumed that the last column represents class. * @return Vector of vectors representing cases. Each case * attribute is allocated in type defined by 'names' parameters. * @exception Exception Description of Exception */ public Dataset open(String fileName, String className) throws Exception { int lineCount = 0; try { // Open file BufferedReader in = new BufferedReader(new FileReader(fileName)); // Get attribute names String s; Vector header = null; while ((s = in.readLine()) != null) { ++lineCount; // Read line ParseLine p = new ParseLine(s); header = p.read(); if (header != null) { break; } } if (header == null) { throw new Exception("Data file is empty."); } int headerLine = lineCount; int headerSize = header.size(); int lastCol = headerSize - 1; Vector cases = new Vector(); // Get cases while ((s = in.readLine()) != null) { ++lineCount; // Read line ParseLine p = new ParseLine(s); Vector l = p.read(); if (l == null) { continue; } if (l.size() != header.size()) { throw new Exception("Number of tokens in line #" + lineCount + " does not match number of tokens in header in line#" + headerLine + " (" + l.size() + " != " + header.size() + ")"); } cases.add(l); } lineCount = -1; in.close(); if (className != null) { // Reorder columns so that class attribute is the last one in the row // Find which column is the class in int classCol = -1; for (int col = 0; col < headerSize; ++col) { if (className.equals(header.get(col))) { classCol = col; break; } } if (classCol == -1) { throw new Exception("Cannot find class name '" + className + "' in the header {" + header + "}."); } if (classCol != lastCol) { // Swap class name in the header Object oc = header.get(classCol); header.set(classCol, header.get(lastCol)); header.set(lastCol, oc); // Swap attributes in each row for (int row = 0; row < cases.size(); ++row) { Vector thisCase = (Vector) cases.get(row); Object o = thisCase.get(classCol); thisCase.set(classCol, thisCase.get(lastCol)); thisCase.set(lastCol, o); } } } // Create enumerations of values for each attribute TreeSet[] ss = new TreeSet[headerSize]; for (int col = 0; col < headerSize; ++col) { ss[col] = new TreeSet(); } for (int row = 0; row < cases.size(); ++row) { Vector thisCase = (Vector) cases.get(row); for (int col = 0; col < headerSize; ++col) { ss[col].add(thisCase.get(col)); } } // Create attribute descriptions AttributeSpecs[] names = new AttributeSpecs[headerSize]; for (int i = 0; i < headerSize; ++i) { Object[] a = ss[i].toArray(); String[] states = new String[a.length]; for (int j = 0; j < a.length; ++j) { states[j] = (String) a[j]; } names[i] = new AttributeSpecs(); names[i].setType(AttributeType.DISCRETE); names[i].setName((String) header.get(i)); names[i].setStates(states); } DatasetInt dataset = new DatasetInt(); dataset.names = names; dataset.cases = new Vector(); // Create indexed cases for (int row = 0; row < cases.size(); ++row) { int[] r = new int[headerSize]; Vector thisCase = (Vector) cases.get(row); for (int col = 0; col < headerSize; ++col) { int index = names[col].getState((String) thisCase.get(col)); r[col] = index; } dataset.cases.add(r); } return dataset; } catch (Exception e) { if (lineCount > 0) { System.out.println("Error in line " + lineCount); System.out.println(e.toString()); } throw e; } } /** * LOCAL * * @param rawData Description of Parameter * @param names Description of Parameter * @return Description of the Returned Value * @exception Exception Description of Exception */ /** * Convert case from a raw format. * * @param rawData Description of Parameter * @param names Description of Parameter * @return Description of the Returned Value * @exception Exception Description of Exception */ protected int[] convertCaseInt(Vector rawData, AttributeSpecs[] names) throws Exception { // Verify size int size = names.length; if (rawData.size() != size) { throw new Exception("DatasetReaderInt.convertLine: found " + rawData.size() + ", expecting " + size); } int[] v = new int[names.length]; for (int i = 0; i < size; ++i) { AttributeSpecs a = names[i]; String s = (String) rawData.get(i); if (getDiscardIncompleteCases() && s.equals("?")) { return null; } try { AttributeType type = a.getType(); if (type == AttributeType.IGNORE) { v[i] = -1; } else if (type == AttributeType.CONTINUOUS) { throw new Exception("DatasetReaderInt.convertLine: internal error: " + "Discreate dataset can not have values of type " + "CONTINUOUS."); } else if (type == AttributeType.DISCRETE) { v[i] = a.getState(s); } else { throw new Exception("DatasetReaderInt.convertLine: internal error: " + "incorrect attribute code"); } } catch (NumberFormatException e) { throw new Exception("DatasetReaderInt.convertLine: cannot convert token #" + (i + 1) + " to a number (" + e.getMessage() + ")"); } catch (AttributeSpecs.AttributeException e) { throw new Exception("DatasetReaderInt.convertLine: token #" + (i + 1) + " discrete attribute declaration. Have '" + s + "', expecting one of '" + a.toString() + "'"); } } return v; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -