📄 csvloader.java
字号:
Object cval = current.elementAt(j);
if (cval instanceof String) {
if (((String)cval).compareTo("?") == 0) {
vals[j] = Instance.missingValue();
} else {
if (!dataSet.attribute(j).isNominal()) {
System.err.println("Wrong attribute type!!!");
System.exit(1);
}
// find correct index
Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
int index = ((Integer)lookup.get(cval)).intValue();
vals[j] = (double)index;
}
} else if (dataSet.attribute(j).isNominal()) {
// find correct index
Hashtable lookup = (Hashtable)m_cumulativeStructure.elementAt(j);
int index = ((Integer)lookup.get(cval)).intValue();
vals[j] = (double)index;
} else {
vals[j] = ((Double)cval).doubleValue();
}
}
dataSet.add(new Instance(1.0, vals));
}
m_structure = new Instances(dataSet, 0);
setRetrieval(BATCH);
m_cumulativeStructure = null; // conserve memory
return dataSet;
}
/**
* CSVLoader is unable to process a data set incrementally.
*
* @return never returns without throwing an exception
* @exception IOException always. CSVLoader is unable to process a data
* set incrementally.
*/
public Instance getNextInstance() throws IOException {
  // Incremental reading is not supported: every call fails with the same
  // IOException, so no value is ever returned.
  final String reason = "CSVLoader can't read data sets incrementally.";
  throw new IOException(reason);
}
/**
* Attempts to parse a line of the data set.
*
* @param tokenizer the tokenizer
* @return a FastVector containing String and Double objects representing
* the values of the instance, or null if the end of the file was reached.
* @exception IOException if an error occurs
*
* <pre><jml>
* private_normal_behavior
* requires: tokenizer != null;
* ensures: \result != null;
* also
* private_exceptional_behavior
* requires: tokenizer == null
* || (* unsuccessful parse *);
* signals: (IOException);
* </jml></pre>
*/
private FastVector getInstance(StreamTokenizer tokenizer)
    throws IOException {
  // Reads one line of the data set into a row of values: a Double for every
  // token that parses as a number, a String (blanks replaced by '_') for
  // every other token, and "?" wherever a value is missing. Returns null
  // when the tokenizer is already at the end of the file.
  FastVector current = new FastVector();

  // Check if end of file reached.
  ConverterUtils.getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return null;
  }

  boolean first = true;
  boolean wasSep;
  while (tokenizer.ttype != StreamTokenizer.TT_EOL &&
         tokenizer.ttype != StreamTokenizer.TT_EOF) {
    // The first token of the line was already fetched above.
    if (!first) {
      ConverterUtils.getToken(tokenizer);
    }

    if (tokenizer.ttype == ',' || tokenizer.ttype == '\t' ||
        tokenizer.ttype == StreamTokenizer.TT_EOL ||
        tokenizer.ttype == StreamTokenizer.TT_EOF) {
      // A separator or the end of the line where a value was expected means
      // the value is missing. The end of the file is treated the same way,
      // so that a last line without a trailing newline is handled like a
      // terminated one instead of being parsed as a value.
      current.addElement("?");
      wasSep = true;
    } else {
      wasSep = false;
      // try to parse as a number
      try {
        current.addElement(Double.valueOf(tokenizer.sval));
      } catch (NumberFormatException e) {
        // otherwise assume it is an enumerated value
        current.addElement(tokenizer.sval.replace(' ', '_'));
      }
    }

    if (!wasSep) {
      // Step past the token that follows the value just read.
      ConverterUtils.getToken(tokenizer);
    }
    first = false;
  }

  // check number of values read
  if (current.size() != m_structure.numAttributes()) {
    ConverterUtils.errms(tokenizer,
                         "wrong number of values. Read " + current.size()
                         + ", expected " + m_structure.numAttributes());
  }

  // check for structure update
  try {
    checkStructure(current);
  } catch (Exception ex) {
    // Do not hand back a row whose values were not registered in the
    // cumulative structure; report the failure to the caller instead of
    // only printing it.
    IOException failure =
      new IOException("unable to update the data set structure: " + ex);
    failure.initCause(ex);
    throw failure;
  }
  return current;
}
/**
* Checks the current instance against what is known about the structure
* of the data set so far. If there is a nominal value for an attribute
* that was believed to be numeric then all previously seen values for this
* attribute are stored in a Hashtable.
*
* @param current a <code>FastVector</code> value
* @exception Exception if an error occurs
*
* <pre><jml>
* private_normal_behavior
* requires: current != null;
* also
* private_exceptional_behavior
* requires: current == null
* || (* unrecognized object type in current *);
* signals: (Exception);
* </jml></pre>
*/
private void checkStructure(FastVector current) throws Exception {
  // Updates the per-attribute value tables (m_cumulativeStructure) with the
  // values of one parsed row. An attribute's table stays empty for as long
  // as only numbers (or missing values) have been seen for it.
  if (current == null) {
    throw new Exception("current shouldn't be null in checkStructure");
  }

  final int numValues = current.size();
  for (int col = 0; col < numValues; col++) {
    Object value = current.elementAt(col);

    if (value instanceof Double) {
      // A number only needs an index once the attribute has a non-empty
      // table, i.e. once it is being tracked as nominal.
      Hashtable known = (Hashtable) m_cumulativeStructure.elementAt(col);
      if (known.size() != 0 && !known.containsKey(value)) {
        known.put(new Double(((Double) value).doubleValue()),
                  new Integer(known.size()));
      }
      continue;
    }

    if (!(value instanceof String)) {
      throw new Exception("Wrong object type in checkStructure!");
    }

    String label = (String) value;
    if (label.equals("?")) {
      // missing value: nothing to record
      continue;
    }

    Hashtable known = (Hashtable) m_cumulativeStructure.elementAt(col);
    if (known.containsKey(label)) {
      continue;
    }

    if (known.isEmpty()) {
      // First nominal value for an attribute that may have been treated as
      // numeric so far: give every number already read for it an index.
      final int numRows = m_cumulativeInstances.size();
      for (int row = 0; row < numRows; row++) {
        FastVector earlierRow = (FastVector) m_cumulativeInstances.elementAt(row);
        Object earlier = earlierRow.elementAt(col);
        if (earlier instanceof String) {
          // must have been a missing value
          continue;
        }
        if (!known.containsKey(earlier)) {
          known.put(new Double(((Double) earlier).doubleValue()),
                    new Integer(known.size()));
        }
      }
    }

    // The new label gets the next free index.
    known.put(label, new Integer(known.size()));
  }
}
/**
* Assumes the first line of the file contains the attribute names.
* Assumes all attributes are real (Reading the full data set with
* getDataSet will establish the true structure).
*
* @param tokenizer a <code>StreamTokenizer</code> value
* @exception IOException if an error occurs
*
* <pre><jml>
* private_normal_behavior
* requires: tokenizer != null;
* modifiable: m_structure;
* ensures: m_structure != null;
* also
* private_exceptional_behavior
* requires: tokenizer == null
* || (* unsuccessful parse *);
* signals: (IOException);
* </jml></pre>
*/
private void readHeader(StreamTokenizer tokenizer) throws IOException {
  // Builds m_structure from the first line of the input: one attribute per
  // token, named after the token's string value, and no instances.
  FastVector attribNames = new FastVector();

  ConverterUtils.getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    ConverterUtils.errms(tokenizer,"premature end of file");
  }

  // Stop at the end of the file as well as at the end of the line. When the
  // header is the last line and has no trailing newline, TT_EOL is never
  // produced, and waiting for it alone would keep adding attributes without
  // ever leaving the loop.
  while (tokenizer.ttype != StreamTokenizer.TT_EOL &&
         tokenizer.ttype != StreamTokenizer.TT_EOF) {
    attribNames.addElement(new Attribute(tokenizer.sval));
    ConverterUtils.getToken(tokenizer);
  }

  m_structure = new Instances(m_sourceFile.getName(), attribNames, 0);
}
/**
* Initializes the stream tokenizer
*
* @param tokenizer the tokenizer to initialize
*/
private void initTokenizer(StreamTokenizer tokenizer) {
  // Characters that split a line into fields, and characters that delimit
  // quoted strings.
  final char[] separators = {',', '\t'};
  final char[] quotes = {'"', '\''};

  // Start from an empty syntax table: control characters are whitespace and
  // everything from the blank up to \u00FF is part of a word.
  tokenizer.resetSyntax();
  tokenizer.whitespaceChars(0, ' ' - 1);
  tokenizer.wordChars(' ', '\u00FF');

  // These calls come after wordChars so that they override it for the
  // individual characters concerned.
  for (int i = 0; i < separators.length; i++) {
    tokenizer.whitespaceChars(separators[i], separators[i]);
  }
  tokenizer.commentChar('%');
  for (int i = 0; i < quotes.length; i++) {
    tokenizer.quoteChar(quotes[i]);
  }

  // Line ends are reported as tokens of their own.
  tokenizer.eolIsSignificant(true);
}
/**
* Main method.
*
* @param args should contain the name of an input file.
*/
public static void main(String [] args) {
  // Without a file name there is nothing to load: print the usage and stop.
  if (args.length == 0) {
    System.err.println("Usage:\n\tCSVLoader <file.csv>\n");
    return;
  }

  File csvFile = new File(args[0]);
  try {
    // Load the whole file and print the resulting data set.
    CSVLoader loader = new CSVLoader();
    loader.setSource(csvFile);
    System.out.println(loader.getDataSet());
  } catch (Exception ex) {
    ex.printStackTrace();
  }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -