📄 instances.java
result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5)); result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5)); result.append(Utils.padLeft("Missing", 12)); result.append(Utils.padLeft("Unique", 12)); result.append(Utils.padLeft("Dist", 6)).append('\n'); for (int i = 0; i < numAttributes(); i++) { Attribute a = attribute(i); AttributeStats as = attributeStats(i); result.append(Utils.padLeft("" + (i + 1), 4)).append(' '); result.append(Utils.padRight(a.name(), 25)).append(' '); long percent; switch (a.type()) { case Attribute.NOMINAL: result.append(Utils.padLeft("Nom", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.NUMERIC: result.append(Utils.padLeft("Num", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.DATE: result.append(Utils.padLeft("Dat", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.STRING: result.append(Utils.padLeft("Str", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; default: result.append(Utils.padLeft("???", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; } result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /"); percent = Math.round(100.0 * as.missingCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /"); percent = Math.round(100.0 * as.uniqueCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' '); result.append('\n'); } return result.toString(); } /** * Reads a single instance using the tokenizer and appends it * to the dataset. Automatically expands the dataset if it * is not large enough to hold the instance. * * @param tokenizer the tokenizer to be used * @param flag if method should test for carriage return after * each instance * @return false if end of file has been reached * @exception IOException if the information is not read * successfully */ protected boolean getInstance(StreamTokenizer tokenizer, boolean flag) throws IOException { // Check if any attributes have been declared. 
    if (m_Attributes.size() == 0) {
      errms(tokenizer, "no header information available");
    }

    // Check if end of file reached.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      return false;
    }

    // Parse instance
    if (tokenizer.ttype == '{') {
      return getInstanceSparse(tokenizer, flag);
    } else {
      return getInstanceFull(tokenizer, flag);
    }
  }

  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. Automatically expands the dataset if it
   * is not large enough to hold the instance.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag if method should test for carriage return after
   * each instance
   * @return false if end of file has been reached
   * @exception IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceSparse(StreamTokenizer tokenizer, boolean flag)
    throws IOException {

    int valIndex, numValues = 0, maxIndex = -1;

    // Get values
    do {
      // Get index
      getIndex(tokenizer);
      if (tokenizer.ttype == '}') {
        break;
      }

      // Is index valid?
      try {
        m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
      } catch (NumberFormatException e) {
        errms(tokenizer, "index number expected");
      }
      if (m_IndicesBuffer[numValues] <= maxIndex) {
        errms(tokenizer, "indices have to be ordered");
      }
      if ((m_IndicesBuffer[numValues] < 0) ||
          (m_IndicesBuffer[numValues] >= numAttributes())) {
        errms(tokenizer, "index out of bounds");
      }
      maxIndex = m_IndicesBuffer[numValues];

      // Get value;
      getNextToken(tokenizer);

      // Check if value is missing.
      if (tokenizer.ttype == '?') {
        m_ValueBuffer[numValues] = Instance.missingValue();
      } else {

        // Check if token is valid.
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          errms(tokenizer, "not a valid value");
        }
        switch (attribute(m_IndicesBuffer[numValues]).type()) {
          case Attribute.NOMINAL:
            // Check if value appears in header.
            valIndex = attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
            if (valIndex == -1) {
              errms(tokenizer, "nominal value not declared in header");
            }
            m_ValueBuffer[numValues] = (double) valIndex;
            break;
          case Attribute.NUMERIC:
            // Check if value is really a number.
            try {
              m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).doubleValue();
            } catch (NumberFormatException e) {
              errms(tokenizer, "number expected");
            }
            break;
          case Attribute.STRING:
            m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
            break;
          case Attribute.DATE:
            try {
              m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).parseDate(tokenizer.sval);
            } catch (ParseException e) {
              errms(tokenizer, "unparseable date: " + tokenizer.sval);
            }
            break;
          default:
            errms(tokenizer, "unknown attribute type in column " + m_IndicesBuffer[numValues]);
        }
      }
      numValues++;
    } while (true);
    if (flag) {
      getLastToken(tokenizer, true);
    }

    // Add instance to dataset
    double[] tempValues = new double[numValues];
    int[] tempIndices = new int[numValues];
    System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
    System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
    add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
    return true;
  }

  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. Automatically expands the dataset if it
   * is not large enough to hold the instance.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag if method should test for carriage return after
   * each instance
   * @return false if end of file has been reached
   * @exception IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceFull(StreamTokenizer tokenizer, boolean flag)
    throws IOException {

    double[] instance = new double[numAttributes()];
    int index;

    // Get values for all attributes.
    for (int i = 0; i < numAttributes(); i++) {

      // Get next token
      if (i > 0) {
        getNextToken(tokenizer);
      }

      // Check if value is missing.
      if (tokenizer.ttype == '?') {
        instance[i] = Instance.missingValue();
      } else {

        // Check if token is valid.
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          errms(tokenizer, "not a valid value");
        }
        switch (attribute(i).type()) {
          case Attribute.NOMINAL:
            // Check if value appears in header.
            index = attribute(i).indexOfValue(tokenizer.sval);
            if (index == -1) {
              errms(tokenizer, "nominal value not declared in header");
            }
            instance[i] = (double) index;
            break;
          case Attribute.NUMERIC:
            // Check if value is really a number.
            try {
              instance[i] = Double.valueOf(tokenizer.sval).doubleValue();
            } catch (NumberFormatException e) {
              errms(tokenizer, "number expected");
            }
            break;
          case Attribute.STRING:
            instance[i] = attribute(i).addStringValue(tokenizer.sval);
            break;
          case Attribute.DATE:
            try {
              instance[i] = attribute(i).parseDate(tokenizer.sval);
            } catch (ParseException e) {
              errms(tokenizer, "unparseable date: " + tokenizer.sval);
            }
            break;
          default:
            errms(tokenizer, "unknown attribute type in column " + i);
        }
      }
    }
    if (flag) {
      getLastToken(tokenizer, true);
    }

    // Add instance to dataset
    add(new Instance(1, instance));
    return true;
  }

  /**
   * Reads and stores header of an ARFF file.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if the information is not read
   * successfully
   */
  protected void readHeader(StreamTokenizer tokenizer) throws IOException {

    String attributeName;
    FastVector attributeValues;
    int i;

    // Get name of relation.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    if (tokenizer.sval.equalsIgnoreCase("@relation")) {
      getNextToken(tokenizer);
      m_RelationName = tokenizer.sval;
      getLastToken(tokenizer, false);
    } else {
      errms(tokenizer, "keyword @relation expected");
    }

    // Create vectors to hold information temporarily.
    m_Attributes = new FastVector();

    // Get attribute declarations.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    while (tokenizer.sval.equalsIgnoreCase("@attribute")) {

      // Get attribute name.
      getNextToken(tokenizer);
      attributeName = tokenizer.sval;
      getNextToken(tokenizer);

      // Check if attribute is nominal.
      if (tokenizer.ttype == StreamTokenizer.TT_WORD) {

        // Attribute is real, integer, or string.
        if (tokenizer.sval.equalsIgnoreCase("real") ||
            tokenizer.sval.equalsIgnoreCase("integer") ||
            tokenizer.sval.equalsIgnoreCase("numeric")) {
          m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
          readTillEOL(tokenizer);
        } else if (tokenizer.sval.equalsIgnoreCase("string")) {
          m_Attributes.
            addElement(new Attribute(attributeName, (FastVector) null,
                                     numAttributes()));
          readTillEOL(tokenizer);
        } else if (tokenizer.sval.equalsIgnoreCase("date")) {
          String format = null;
          if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
            if ((tokenizer.ttype != StreamTokenizer.TT_WORD) &&
                (tokenizer.ttype != '\'') &&
                (tokenizer.ttype != '\"')) {
              errms(tokenizer, "not a valid date format");
            }
            format = tokenizer.sval;
            readTillEOL(tokenizer);
          } else {
            tokenizer.pushBack();
          }
          m_Attributes.addElement(new Attribute(attributeName, format,
                                                numAttributes()));
        } else {
          errms(tokenizer, "no valid attribute type or invalid " +
                "enumeration");
        }
      } else {

        // Attribute is nominal.
        attributeValues = new FastVector();
        tokenizer.pushBack();

        // Get values for nominal attribute.
        if (tokenizer.nextToken() != '{') {
          errms(tokenizer, "{ expected at beginning of enumeration");
        }
        while (tokenizer.nextToken() != '}') {
          if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
            errms(tokenizer, "} expected at end of enumeration");
          } else {
            attributeValues.addElement(tokenizer.sval);
          }
        }
        if (attributeValues.size() == 0) {
          errms(tokenizer, "no nominal values found");
        }
        m_Attributes.
          addElement(new Attribute(attributeName, attributeValues,
                                   numAttributes()));
      }
      getLastToken(tokenizer, false);
      getFirstToken(tokenizer);
      if (tokenizer.ttype == StreamTokenizer.TT_EOF)
        errms(tokenizer, "premature end of file");
    }

    // Check if data part follows. We can't easily check for EOL.
    if (!tokenizer.sval.equalsIgnoreCase("@data")) {
      errms(tokenizer, "keyword @data expected");
    }

    // Check if any attributes have been declared.
    if (m_Attributes.size() == 0) {
      errms(tokenizer, "no attributes declared");
    }

    // Allocate buffers in case sparse instances have to be read
    m_ValueBuffer = new double[numAttributes()];
    m_IndicesBuffer = new int[numAttributes()];
  }

  /**
   * Copies instances from one set to the end of another
   * one.
   *
   * @param source the source of the instances
   * @param from the position of the first instance to be copied
   * @param dest the destination for the instances
   * @param num the number of instances to be copied
   */
  private void copyInstances(int from, Instances dest, int num) {
    for (int i = 0; i < num; i++) {
      dest.add(instance(from + i));
    }
  }

  /**
   * Throws error message with line number and last token read.
   *
   * @param theMsg the error message to be thrown
   * @param tokenizer the stream tokenizer
   * @throws IOException containing the error message
   */
  private void errms(StreamTokenizer tokenizer, String theMsg) throws IOException {
    throw new IOException(theMsg + ", read " + tokenizer.toString());
  }

  /**
   * Replaces the attribute information by a clone of
   * itself.
   */
  private void freshAttributeInfo() {
    m_Attributes = (FastVector) m_Attributes.copyElements();
  }

  /**
   * Gets next token, skipping empty lines.
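For context, the protected reader methods above are normally driven by the public Instances(Reader) constructor, which calls readHeader() once and then getInstance() in a loop until it returns false at end of file. A minimal usage sketch, assuming the standard weka.core API and a hypothetical local file named data.arff:

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class ArffReadSketch {
  public static void main(String[] args) throws Exception {
    // The constructor parses the @relation/@attribute header, then each data
    // row (sparse rows in "{index value, ...}" form, dense rows otherwise).
    BufferedReader reader = new BufferedReader(new FileReader("data.arff")); // hypothetical path
    Instances data = new Instances(reader);
    reader.close();
    // Prints the per-attribute summary table assembled at the top of this listing.
    System.out.println(data.toSummaryString());
  }
}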