📄 arffloader.java
字号:
/** * Reads a single instance using the tokenizer and returns it. * * @param structure the dataset header information, will get updated * in case of string or relational attributes * @param flag if method should test for carriage return after * each instance * @return null if end of file has been reached * @throws IOException if the information is not read * successfully */ public Instance readInstance(Instances structure, boolean flag) throws IOException { return getInstance(structure, flag); } /** * Reads a single instance using the tokenizer and returns it. * * @param structure the dataset header information, will get updated * in case of string or relational attributes * @param flag if method should test for carriage return after * each instance * @return null if end of file has been reached * @throws IOException if the information is not read * successfully */ protected Instance getInstance(Instances structure, boolean flag) throws IOException { m_Data = structure; // Check if any attributes have been declared. if (m_Data.numAttributes() == 0) { errorMessage("no header information available"); } // Check if end of file reached. getFirstToken(); if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) { return null; } // Parse instance if (m_Tokenizer.ttype == '{') { return getInstanceSparse(flag); } else { return getInstanceFull(flag); } } /** * Reads a single instance using the tokenizer and returns it. * * @param flag if method should test for carriage return after * each instance * @return null if end of file has been reached * @throws IOException if the information is not read * successfully */ protected Instance getInstanceSparse(boolean flag) throws IOException { int valIndex, numValues = 0, maxIndex = -1; // Get values do { // Get index getIndex(); if (m_Tokenizer.ttype == '}') { break; } // Is index valid? try{ m_IndicesBuffer[numValues] = Integer.valueOf(m_Tokenizer.sval).intValue(); } catch (NumberFormatException e) { errorMessage("index number expected"); } if (m_IndicesBuffer[numValues] <= maxIndex) { errorMessage("indices have to be ordered"); } if ((m_IndicesBuffer[numValues] < 0) || (m_IndicesBuffer[numValues] >= m_Data.numAttributes())) { errorMessage("index out of bounds"); } maxIndex = m_IndicesBuffer[numValues]; // Get value; getNextToken(); // Check if value is missing. if (m_Tokenizer.ttype == '?') { m_ValueBuffer[numValues] = Instance.missingValue(); } else { // Check if token is valid. if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) { errorMessage("not a valid value"); } switch (m_Data.attribute(m_IndicesBuffer[numValues]).type()) { case Attribute.NOMINAL: // Check if value appears in header. valIndex = m_Data.attribute(m_IndicesBuffer[numValues]).indexOfValue(m_Tokenizer.sval); if (valIndex == -1) { errorMessage("nominal value not declared in header"); } m_ValueBuffer[numValues] = (double)valIndex; break; case Attribute.NUMERIC: // Check if value is really a number. try{ m_ValueBuffer[numValues] = Double.valueOf(m_Tokenizer.sval). doubleValue(); } catch (NumberFormatException e) { errorMessage("number expected"); } break; case Attribute.STRING: m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addStringValue(m_Tokenizer.sval); break; case Attribute.DATE: try { m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).parseDate(m_Tokenizer.sval); } catch (ParseException e) { errorMessage("unparseable date: " + m_Tokenizer.sval); } break; case Attribute.RELATIONAL: try { ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(m_IndicesBuffer[numValues]).relation(), 0); Instances data = arff.getData(); m_ValueBuffer[numValues] = m_Data.attribute(m_IndicesBuffer[numValues]).addRelation(data); } catch (Exception e) { throw new IOException(e.toString() + " of line " + getLineNo()); } break; default: errorMessage("unknown attribute type in column " + m_IndicesBuffer[numValues]); } } numValues++; } while (true); if (flag) { getLastToken(true); } // Add instance to dataset double[] tempValues = new double[numValues]; int[] tempIndices = new int[numValues]; System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues); System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues); Instance inst = new SparseInstance(1, tempValues, tempIndices, m_Data.numAttributes()); inst.setDataset(m_Data); return inst; } /** * Reads a single instance using the tokenizer and returns it. * * @param flag if method should test for carriage return after * each instance * @return null if end of file has been reached * @throws IOException if the information is not read * successfully */ protected Instance getInstanceFull(boolean flag) throws IOException { double[] instance = new double[m_Data.numAttributes()]; int index; // Get values for all attributes. for (int i = 0; i < m_Data.numAttributes(); i++){ // Get next token if (i > 0) { getNextToken(); } // Check if value is missing. if (m_Tokenizer.ttype == '?') { instance[i] = Instance.missingValue(); } else { // Check if token is valid. if (m_Tokenizer.ttype != StreamTokenizer.TT_WORD) { errorMessage("not a valid value"); } switch (m_Data.attribute(i).type()) { case Attribute.NOMINAL: // Check if value appears in header. index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval); if (index == -1) { errorMessage("nominal value not declared in header"); } instance[i] = (double)index; break; case Attribute.NUMERIC: // Check if value is really a number. try{ instance[i] = Double.valueOf(m_Tokenizer.sval). doubleValue(); } catch (NumberFormatException e) { errorMessage("number expected"); } break; case Attribute.STRING: instance[i] = m_Data.attribute(i).addStringValue(m_Tokenizer.sval); break; case Attribute.DATE: try { instance[i] = m_Data.attribute(i).parseDate(m_Tokenizer.sval); } catch (ParseException e) { errorMessage("unparseable date: " + m_Tokenizer.sval); } break; case Attribute.RELATIONAL: try { ArffReader arff = new ArffReader(new StringReader(m_Tokenizer.sval), m_Data.attribute(i).relation(), 0); Instances data = arff.getData(); instance[i] = m_Data.attribute(i).addRelation(data); } catch (Exception e) { throw new IOException(e.toString() + " of line " + getLineNo()); } break; default: errorMessage("unknown attribute type in column " + i); } } } if (flag) { getLastToken(true); } // Add instance to dataset Instance inst = new Instance(1, instance); inst.setDataset(m_Data); return inst; } /** * Reads and stores header of an ARFF file. * * @param capacity the number of instances to reserve in the data * structure * @throws IOException if the information is not read * successfully */ protected void readHeader(int capacity) throws IOException { m_Lines = 0; String relationName = ""; // Get name of relation. getFirstToken(); if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) { errorMessage("premature end of file"); } if (Instances.ARFF_RELATION.equalsIgnoreCase(m_Tokenizer.sval)) { getNextToken(); relationName = m_Tokenizer.sval; getLastToken(false); } else { errorMessage("keyword " + Instances.ARFF_RELATION + " expected"); } // Create vectors to hold information temporarily. FastVector attributes = new FastVector(); // Get attribute declarations. getFirstToken(); if (m_Tokenizer.ttype == StreamTokenizer.TT_EOF) { errorMessage("premature end of file"); } while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(m_Tokenizer.sval)) { attributes = parseAttribute(attributes); } // Check if data part follows. We can't easily check for EOL. if (!Instances.ARFF_DATA.equalsIgnoreCase(m_Tokenizer.sval)) { errorMessage("keyword " + Instances.ARFF_DATA + " expected"); } // Check if any attributes have been declared. if (attributes.size() == 0) { errorMessage("no attributes declared"); } m_Data = new Instances(relationName, attributes, capacity); } /** * Parses the attribute declaration. * * @param attributes the current attributes vector * @return the new attributes vector * @throws IOException if the information is not read * successfully */ protected FastVector parseAttribute(FastVector attributes) throws IOException { String attributeName; FastVector attributeValues; // Get attribute name. getNextToken(); attributeName = m_Tokenizer.sval; getNextToken(); // Check if attribute is nominal. if (m_Tokenizer.ttype == StreamTokenizer.TT_WORD) { // Attribute is real, integer, or string. if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) || m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) || m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) { attributes.addElement(new Attribute(attributeName, attributes.size())); readTillEOL(); } else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) { attributes. addElement(new Attribute(attributeName, (FastVector)null, attributes.size())); readTillEOL(); } else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) { String format = null; if (m_Tokenizer.nextToken() != StreamTokenizer.TT_EOL) { if ((m_Tokenizer.ttype != StreamTokenizer.TT_WORD) && (m_Tokenizer.ttype != '\'') && (m_Tokenizer.ttype != '\"')) { errorMessage("not a valid date format"); } format = m_Tokenizer.sval; readTillEOL(); } else { m_Tokenizer.pushBack(); } attributes.addElement(new Attribute(attributeName, format, attributes.size())); } else if (m_Tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) { readTillEOL(); // Read attributes for subrelation // First, save current set of attributes
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -