📄 Instances.java
  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. Automatically expands the dataset if it
   * is not large enough to hold the instance.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag whether the method should check for an end of line
   * after each instance
   * @return false if end of file has been reached
   * @throws IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceFull(StreamTokenizer tokenizer, boolean flag)
    throws IOException {

    double[] instance = new double[numAttributes()];
    int index;

    // Get values for all attributes.
    for (int i = 0; i < numAttributes(); i++) {

      // Get next token
      if (i > 0) {
        getNextToken(tokenizer);
      }

      // Check if value is missing.
      if (tokenizer.ttype == '?') {
        instance[i] = Instance.missingValue();
      } else {

        // Check if token is valid.
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          errms(tokenizer, "not a valid value");
        }
        switch (attribute(i).type()) {
        case Attribute.NOMINAL:
          // Check if value appears in header.
          index = attribute(i).indexOfValue(tokenizer.sval);
          if (index == -1) {
            errms(tokenizer, "nominal value not declared in header");
          }
          instance[i] = (double) index;
          break;
        case Attribute.NUMERIC:
          // Check if value is really a number.
          try {
            instance[i] = Double.valueOf(tokenizer.sval).doubleValue();
          } catch (NumberFormatException e) {
            errms(tokenizer, "number expected");
          }
          break;
        case Attribute.STRING:
          instance[i] = attribute(i).addStringValue(tokenizer.sval);
          break;
        case Attribute.DATE:
          try {
            instance[i] = attribute(i).parseDate(tokenizer.sval);
          } catch (ParseException e) {
            errms(tokenizer, "unparseable date: " + tokenizer.sval);
          }
          break;
        case Attribute.RELATIONAL:
          StringReader reader = new StringReader(tokenizer.sval);
          StreamTokenizer innerTokenizer = new StreamTokenizer(reader);
          initTokenizer(innerTokenizer);
          Instances data = new Instances(attribute(i).relation(), 100);
          // Allocate buffers in case sparse instances have to be read
          data.m_ValueBuffer = new double[data.numAttributes()];
          data.m_IndicesBuffer = new int[data.numAttributes()];
          while (data.getInstance(innerTokenizer, true)) {}
          data.compactify();
          instance[i] = attribute(i).addRelation(data);
          break;
        default:
          errms(tokenizer, "unknown attribute type in column " + i);
        }
      }
    }
    if (flag) {
      getLastToken(tokenizer, true);
    }

    // Add instance to dataset
    add(new Instance(1, instance));
    return true;
  }

  /**
   * Reads and stores header of an ARFF file.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if the information is not read
   * successfully
   */
  protected void readHeader(StreamTokenizer tokenizer) throws IOException {

    // Get name of relation.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
      getNextToken(tokenizer);
      m_RelationName = tokenizer.sval;
      getLastToken(tokenizer, false);
    } else {
      errms(tokenizer, "keyword " + ARFF_RELATION + " expected");
    }

    // Create vectors to hold information temporarily.
    m_Attributes = new FastVector();

    // Get attribute declarations.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
      parseAttribute(tokenizer);
    }

    // Check if data part follows. We can't easily check for EOL.
    if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
      errms(tokenizer, "keyword " + ARFF_DATA + " expected");
    }

    // Check if any attributes have been declared.
    if (m_Attributes.size() == 0) {
      errms(tokenizer, "no attributes declared");
    }

    // Allocate buffers in case sparse instances have to be read
    m_ValueBuffer = new double[numAttributes()];
    m_IndicesBuffer = new int[numAttributes()];
  }

  /**
   * Parses the attribute declaration.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if the information is not read
   * successfully
   */
  protected void parseAttribute(StreamTokenizer tokenizer) throws IOException {
    String attributeName;
    FastVector attributeValues;

    // Get attribute name.
    getNextToken(tokenizer);
    attributeName = tokenizer.sval;
    getNextToken(tokenizer);

    // Check if attribute is nominal.
    if (tokenizer.ttype == StreamTokenizer.TT_WORD) {

      // Attribute is real, integer, or string.
      if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
          tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
          tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
        m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
        readTillEOL(tokenizer);
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
        m_Attributes.addElement(new Attribute(attributeName, (FastVector) null,
                                              numAttributes()));
        readTillEOL(tokenizer);
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
        String format = null;
        if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
          if ((tokenizer.ttype != StreamTokenizer.TT_WORD) &&
              (tokenizer.ttype != '\'') &&
              (tokenizer.ttype != '"')) {
            errms(tokenizer, "not a valid date format");
          }
          format = tokenizer.sval;
          readTillEOL(tokenizer);
        } else {
          tokenizer.pushBack();
        }
        m_Attributes.addElement(new Attribute(attributeName, format,
                                              numAttributes()));
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
        readTillEOL(tokenizer);

        // Read attributes for subrelation
        // First, save current set of attributes
        FastVector atts = m_Attributes;
        m_Attributes = new FastVector();

        // Now, read attributes until we hit end of declaration of relational value
        getFirstToken(tokenizer);
        if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
          errms(tokenizer, "premature end of file");
        }
        do {
          if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
            parseAttribute(tokenizer);
          } else if (Attribute.ARFF_END_SUBRELATION.equalsIgnoreCase(tokenizer.sval)) {
            getNextToken(tokenizer);
            if (!attributeName.equalsIgnoreCase(tokenizer.sval)) {
              errms(tokenizer, "declaration of subrelation " + attributeName +
                    " must be terminated by @end " + attributeName);
            }
            break;
          } else {
            errms(tokenizer, "declaration of subrelation " + attributeName +
                  " must be terminated by @end " + attributeName);
          }
        } while (true);

        // Make relation and restore original set of attributes
        Instances relation = new Instances(attributeName, m_Attributes, 0);
        m_Attributes = atts;
        m_Attributes.addElement(new Attribute(attributeName, relation,
                                              numAttributes()));
      } else {
        errms(tokenizer, "no valid attribute type or invalid enumeration");
      }
    } else {

      // Attribute is nominal.
      attributeValues = new FastVector();
      tokenizer.pushBack();

      // Get values for nominal attribute.
      if (tokenizer.nextToken() != '{') {
        errms(tokenizer, "{ expected at beginning of enumeration");
      }
      while (tokenizer.nextToken() != '}') {
        if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
          errms(tokenizer, "} expected at end of enumeration");
        } else {
          attributeValues.addElement(tokenizer.sval);
        }
      }
      if (attributeValues.size() == 0) {
        errms(tokenizer, "no nominal values found");
      }
      m_Attributes.addElement(new Attribute(attributeName, attributeValues,
                                            numAttributes()));
    }
    getLastToken(tokenizer, false);
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
  }

  /**
   * Copies instances from one set to the end of another one.
   *
   * @param from the position of the first instance to be copied
   * @param dest the destination for the instances
   * @param num the number of instances to be copied
   */
  //@ requires 0 <= from && from <= numInstances() - num;
  //@ requires 0 <= num;
  protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) {
    for (int i = 0; i < num; i++) {
      dest.add(instance(from + i));
    }
  }

  /**
   * Throws error message with line number and last token read.
   *
   * @param tokenizer the stream tokenizer
   * @param theMsg the error message to be thrown
   * @throws IOException containing the error message
   */
  protected void errms(StreamTokenizer tokenizer, String theMsg)
    throws IOException {

    throw new IOException(theMsg + ", read " + tokenizer.toString());
  }

  /**
   * Replaces the attribute information by a clone of itself.
   */
  protected void freshAttributeInfo() {
    m_Attributes = (FastVector) m_Attributes.copyElements();
  }

  /**
   * Gets next token, skipping empty lines.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if reading the next token fails
   */
  protected void getFirstToken(StreamTokenizer tokenizer) throws IOException {
    while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}
    if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }

  /**
   * Gets index, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if it finds a premature end of line
   */
  protected void getIndex(StreamTokenizer tokenizer) throws IOException {
    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
  }

  /**
   * Gets token and checks if it is the end of line.
   *
   * @param tokenizer the stream tokenizer
   * @param endOfFileOk whether an end of file is acceptable instead
   * @throws IOException if it doesn't find an end of line
   */
  protected void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
    throws IOException {

    if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
        ((tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
      errms(tokenizer, "end of line expected");
    }
  }

  /**
   * Gets next token, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if it finds a premature end of line
   */
  protected void getNextToken(StreamTokenizer tokenizer) throws IOException {
    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    } else if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }

  /**
   * Initializes the StreamTokenizer used for reading the ARFF file.
   *
   * @param tokenizer the stream tokenizer
   */
  protected void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars(0, ' ');
    tokenizer.wordChars(' ' + 1, '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.commentChar('%');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    tokenizer.ordinaryChar('{');
    tokenizer.ordinaryChar('}');
    tokenizer.eolIsSignificant(true);
  }

  /**
   * Returns string including all instances, their weights and
   * their indices in the original dataset.
   *
   * @return description of instance and its weight as a string
   */
  protected /*@pure@*/ String instancesAndWeights() {
    StringBuffer text = new StringBuffer();

    for (int i = 0; i < numInstances(); i++) {
      text.append(instance(i) + " " + instance(i).weight());
      if (i < numInstances() - 1) {
        text.append("\n");
      }
    }
    return text.toString();
  }

  /**
   * Partitions the instances around a pivot. Used by quicksort and
   * kthSmallestValue.
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @param left the first index of the subset (index starts with 0)
   * @param right the last index of the subset (index starts with 0)
   *
   * @return the index of the middle element
   */
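
For context, a minimal usage sketch (not part of the original file) showing how these protected helpers are typically driven. It assumes the public weka.core.Instances(Reader) constructor from WEKA 3.4/3.5-era releases, which internally calls initTokenizer, readHeader, and then reads one instance per data row; "weather.arff" is a placeholder file name.

  import java.io.BufferedReader;
  import java.io.FileReader;
  import weka.core.Instances;

  public class ArffReadExample {
    public static void main(String[] args) throws Exception {
      // Placeholder path; any ARFF file works.
      BufferedReader reader = new BufferedReader(new FileReader("weather.arff"));
      // The Instances(Reader) constructor wraps the steps shown above:
      // initTokenizer(...), readHeader(...), then one instance per row.
      Instances data = new Instances(reader);
      reader.close();
      System.out.println("Relation:   " + data.relationName());
      System.out.println("Attributes: " + data.numAttributes());
      System.out.println("Instances:  " + data.numInstances());
    }
  }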
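
A second sketch exercising the declaration types handled by parseAttribute (numeric, nominal enumeration, string, and date with an explicit format). The ARFF text and attribute names are invented for illustration; it again assumes the Instances(Reader) constructor.

  import java.io.StringReader;
  import weka.core.Instances;

  public class ArffHeaderExample {
    public static void main(String[] args) throws Exception {
      String arff =
          "@relation weather\n"
        + "@attribute temperature numeric\n"
        + "@attribute outlook {sunny, overcast, rainy}\n"
        + "@attribute note string\n"
        + "@attribute when date \"yyyy-MM-dd\"\n"
        + "@data\n"
        + "72, sunny, 'clear morning', 2005-06-01\n";
      Instances data = new Instances(new StringReader(arff));
      System.out.println(data.attribute(1).isNominal());   // prints: true
      System.out.println(data.instance(0).stringValue(2)); // prints: clear morning
    }
  }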