📄 csvbuilder.java
字号:
//// OpenForecast - open source, general-purpose forecasting package.// Copyright (C) 2002-2004 Steven R. Gould//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package net.sourceforge.openforecast.input;import java.io.BufferedReader;import java.io.File;import java.io.FileReader;import java.io.FileNotFoundException;import java.io.IOException;import java.io.StreamTokenizer;import java.io.StringReader;import java.util.ArrayList;import net.sourceforge.openforecast.DataPoint;import net.sourceforge.openforecast.DataSet;import net.sourceforge.openforecast.Observation;/** * Defines a Builder that can be used to construct a DataSet from CSV (Comma * Separated Values) file or input stream. This class makes for a quick and * easy "import" of data from a variety of other applications such as * Microsoft Excel. * * <p>The last value on each row is assumed to represent the dependent * variable. For example, if the independent variables are represented by x1, * x2, x3 and so on, and the dependent variable is represented by y, then a * row should be of the form: * * <pre> * x1, x2, ..., xi, y * </pre> * * <p>For example, the following represents data points (1,3), (2,5), (3,6), * and (4,7): * * <pre> * 1, 3 * 2, 5 * 3, 6 * 4, 7 * </pre> * * <p>where the values 3, 5, 6 and 7 are the observed values of the dependent * variable corresponding to the associated values of the independent variables * with the values 1, 2, 3, and 4 respectively. By default, the independent * variables - just one in this example - would be named "x1", ..., * "x<em>i</em>", etc. To override this behavior, you can specify a "header * row" containing names for the individual variables (the "columns" of data). * * <p>Using the previous example, if the x<sub>i</sub> represented time, we * could define the data input as follows: * * <pre> * time, observation * 1, 3 * 2, 5 * 3, 6 * 4, 7 * </pre> * * <p>This would name the dependent variable in this case, "time", instead of * the default, "x1". * @author Steven R. Gould * @since 0.4 */public class CSVBuilder extends AbstractBuilder{ /** * Constant defining the character used to separate values. */ private final static char SEPARATOR = ','; /** * Set to true only if the first line/row of the current input source is * to be treated as a header row. */ private boolean hasHeaderRow = false; /** * Stores the file reader from which data is to be read by the build * method. */ private FileReader fileReader; /** * Constructs a new CSVBuilder that reads its input from the named file. * The fields will be named "x1", "x2", "x3", etc. * @param filename the name of the CSV file to read the input from. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( String filename ) throws FileNotFoundException { this( new FileReader(filename) ); } /** * Constructs a new CSVBuilder that reads its input from the named file, * and treats the first row of data as a header row containing field names. * @param filename the name of the CSV file to read the input from. * @param hasHeaderRow set to true if the CSV file has a header row. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( String filename, boolean hasHeaderRow ) throws FileNotFoundException { this( new FileReader(filename), hasHeaderRow ); } /** * Constructs a new CSVBuilder that reads its input from the named file. * The fields will be named "x1", "x2", "x3", etc. * @param file the File object specifying the CSV file to read the input * from. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( File file ) throws FileNotFoundException { this( new FileReader(file) ); } /** * Constructs a new CSVBuilder that reads its input from the named file. * The fields will be named "x1", "x2", "x3", etc. * @param file the File object specifying the CSV file to read the input * from. * @param hasHeaderRow set to true if the CSV file has a header row. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( File file, boolean hasHeaderRow ) throws FileNotFoundException { this( new FileReader(file), hasHeaderRow ); } /** * Constructs a new CSVBuilder that reads its input from the named file * input stream. The fields will be named "x1", "x2", "x3", etc. * @param reader the FileReader object specifying the CSV file reader to * read the input from. */ public CSVBuilder( FileReader reader ) { this( reader, false ); } /** * Constructs a new CSVBuilder that reads its input from the named file * input stream. The fields will be named "x1", "x2", "x3", etc. * @param reader the FileReader object specifying the CSV file reader to * read the input from. * @param hasHeaderRow set to true if the CSV file input stream has a * header row. */ public CSVBuilder( FileReader reader, boolean hasHeaderRow ) { this.fileReader = reader; this.hasHeaderRow = hasHeaderRow; } /** * Retrieves a DataSet - a collection of DataPoints - from the current * input source. The DataSet should contain all DataPoints defined by * the input source. * * <p>In general, build will attempt to convert all lines/rows in the CSV * input to data points. The exceptions are as follows: * <ul> * <li>Blank lines (lines containing only whitespace) will be ignored, * and can be used for spacing in the input.</li> * <li>Lines beginning with a '#' will be treated as comments, and will * be ignored.</li> * <li>If a header row is included - as specified in one of the * constructors - then it will be treated as containing field/variable * names for use by the DataSet.</li> * </ul> * @return a DataSet built from the current input source. * @throws IOException if an error occurred reading from the CSV file. */ public DataSet build() throws IOException { DataSet dataSet = new DataSet(); boolean firstLineRead = false; BufferedReader reader = new BufferedReader( fileReader ); String line; do { // Get next line (trimmed) line = reader.readLine(); if ( line == null ) continue; line = line.trim(); // Skip blank lines if ( line.length() == 0 ) continue; // Skip comment lines if ( line.startsWith( "#" ) ) continue; if ( !firstLineRead ) { firstLineRead = true; if ( hasHeaderRow ) { // Treat first line as header readHeaderRow( line ); continue; } // Calculate how many independent values per line // TODO: Fix this to handle quoted commas int n = 0; for ( int pos=0; (pos=line.indexOf(SEPARATOR,pos)) > 0; pos++ ) n++; setNumberOfVariables( n ); } DataPoint dp = build( line ); dataSet.add( dp ); } while ( line != null ); // line == null when EOF is reached return dataSet; } /** * Parses the given line to extract the variable names. * @param line a String representing the line to parse for variable names. */ private void readHeaderRow( String line ) throws IOException { // Temporary store for the variable names ArrayList vars = new ArrayList(); int pos = 0; while ( pos < line.length() ) { // Get position of next quote int nextQuote = line.indexOf("\"", pos); // Get position of next separator int nextSeparator = line.indexOf(SEPARATOR, pos); // if no next separator, then we're done // since we ignore the name of the independent variable if ( nextSeparator < 0 ) break; if ( nextQuote < 0 || nextQuote > nextSeparator ) { // Treat chars from pos to next separator as a label String name = line.substring(pos, nextSeparator); vars.add( name ); pos = nextSeparator+1; continue; } // Handle quoted strings int secondQuote = line.indexOf("\"",nextQuote+1); String name = line.substring(nextQuote+1,secondQuote); vars.add( name ); // We actually ignore any chars outside of quotes, yet // before the next separator pos = line.indexOf(SEPARATOR,secondQuote)+1; } // Add variable names extracted to this Builder's variable names int n = vars.size(); for ( int i=0; i<n; i++ ) addVariable( ((String)vars.get(i)).trim() ); } /** * Builds a DataPoint from the given CSV line. This method should only be * used to parse a line that is expected to be made up of numeric data * only. Use {@link #readHeaderRow} to read a header row if one is expected. * @param line the input line of comma separated values to parse and use * to construct a new DataPoint. * @return a DataPoint object with values as specified by the given input * String. */ private DataPoint build( String line ) throws IOException { Observation dataPoint = new Observation( 0.0 ); StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( line ) ); tokenizer.commentChar( '#' ); tokenizer.eolIsSignificant( true ); tokenizer.parseNumbers(); int i = 0; int n = getNumberOfVariables(); int lastToken = SEPARATOR; do { int token = tokenizer.nextToken(); switch ( tokenizer.ttype ) { case '\t': case ' ': // Skip whitespace continue; case SEPARATOR: // Check for two adjacent commas if ( lastToken != SEPARATOR ) break; // Two adjacent commas. Assume 0.0 between them tokenizer.nval = 0.0; // Fall through, and handle as a number case StreamTokenizer.TT_NUMBER: // Handle numbers appropriately as data // If this is the last value on the line, treat it // as the dependent variable value if ( i == n ) dataPoint.setDependentValue(tokenizer.nval); else dataPoint.setIndependentValue(getVariableName(i), tokenizer.nval); i++; break; case StreamTokenizer.TT_WORD: throw new IOException( "Invalid input in CSV file. Number expected, found '"+tokenizer.sval+"'"); case StreamTokenizer.TT_EOL: case StreamTokenizer.TT_EOF: break; default: } lastToken = tokenizer.ttype; } while ( tokenizer.ttype != StreamTokenizer.TT_EOF ); return dataPoint; }}// Local Variables:// tab-width: 4// End;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -