📄 Instances.java
  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. Automatically expands the dataset if it
   * is not large enough to hold the instance.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag whether the method should check for an end of line
   * after each instance
   * @return false if end of file has been reached
   * @throws IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceFull(StreamTokenizer tokenizer, boolean flag)
    throws IOException {

    double[] instance = new double[numAttributes()];
    int index;

    // Get values for all attributes.
    for (int i = 0; i < numAttributes(); i++) {

      // Get next token
      if (i > 0) {
        getNextToken(tokenizer);
      }

      // Check if value is missing.
      if (tokenizer.ttype == '?') {
        instance[i] = Instance.missingValue();
      } else {

        // Check if token is valid.
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          errms(tokenizer, "not a valid value");
        }
        switch (attribute(i).type()) {
        case Attribute.NOMINAL:
          // Check if value appears in header.
          index = attribute(i).indexOfValue(tokenizer.sval);
          if (index == -1) {
            errms(tokenizer, "nominal value not declared in header");
          }
          instance[i] = (double) index;
          break;
        case Attribute.NUMERIC:
          // Check if value is really a number.
          try {
            instance[i] = Double.valueOf(tokenizer.sval).doubleValue();
          } catch (NumberFormatException e) {
            errms(tokenizer, "number expected");
          }
          break;
        case Attribute.STRING:
          instance[i] = attribute(i).addStringValue(tokenizer.sval);
          break;
        case Attribute.DATE:
          try {
            instance[i] = attribute(i).parseDate(tokenizer.sval);
          } catch (ParseException e) {
            errms(tokenizer, "unparseable date: " + tokenizer.sval);
          }
          break;
        case Attribute.RELATIONAL:
          StringReader reader = new StringReader(tokenizer.sval);
          StreamTokenizer innerTokenizer = new StreamTokenizer(reader);
          initTokenizer(innerTokenizer);
          Instances data = new Instances(attribute(i).relation(), 100);
          // Allocate buffers in case sparse instances have to be read
          data.m_ValueBuffer = new double[data.numAttributes()];
          data.m_IndicesBuffer = new int[data.numAttributes()];
          while (data.getInstance(innerTokenizer, true)) {}
          data.compactify();
          instance[i] = attribute(i).addRelation(data);
          break;
        default:
          errms(tokenizer, "unknown attribute type in column " + i);
        }
      }
    }
    if (flag) {
      getLastToken(tokenizer, true);
    }

    // Add instance to dataset
    add(new Instance(1, instance));
    return true;
  }

  /**
   * Reads and stores header of an ARFF file.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if the information is not read
   * successfully
   */
  protected void readHeader(StreamTokenizer tokenizer) throws IOException {

    // Get name of relation.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
      getNextToken(tokenizer);
      m_RelationName = tokenizer.sval;
      getLastToken(tokenizer, false);
    } else {
      errms(tokenizer, "keyword " + ARFF_RELATION + " expected");
    }

    // Create vectors to hold information temporarily.
    m_Attributes = new FastVector();

    // Get attribute declarations.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
      parseAttribute(tokenizer);
    }

    // Check if data part follows. We can't easily check for EOL.
    if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
      errms(tokenizer, "keyword " + ARFF_DATA + " expected");
    }

    // Check if any attributes have been declared.
    if (m_Attributes.size() == 0) {
      errms(tokenizer, "no attributes declared");
    }

    // Allocate buffers in case sparse instances have to be read
    m_ValueBuffer = new double[numAttributes()];
    m_IndicesBuffer = new int[numAttributes()];
  }

  /**
   * Parses the attribute declaration.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if the information is not read
   * successfully
   */
  protected void parseAttribute(StreamTokenizer tokenizer) throws IOException {
    String attributeName;
    FastVector attributeValues;

    // Get attribute name.
    getNextToken(tokenizer);
    attributeName = tokenizer.sval;
    getNextToken(tokenizer);

    // Check if attribute is nominal.
    if (tokenizer.ttype == StreamTokenizer.TT_WORD) {

      // Attribute is real, integer, or string.
      if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
          tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
          tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
        m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
        readTillEOL(tokenizer);
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
        m_Attributes.addElement(new Attribute(attributeName, (FastVector) null,
                                              numAttributes()));
        readTillEOL(tokenizer);
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
        String format = null;
        if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
          if ((tokenizer.ttype != StreamTokenizer.TT_WORD) &&
              (tokenizer.ttype != '\'') &&
              (tokenizer.ttype != '"')) {
            errms(tokenizer, "not a valid date format");
          }
          format = tokenizer.sval;
          readTillEOL(tokenizer);
        } else {
          tokenizer.pushBack();
        }
        m_Attributes.addElement(new Attribute(attributeName, format,
                                              numAttributes()));
      } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_RELATIONAL)) {
        readTillEOL(tokenizer);

        // Read attributes for subrelation
        // First, save current set of attributes
        FastVector atts = m_Attributes;
        m_Attributes = new FastVector();

        // Now, read attributes until we hit end of declaration of relational value
        getFirstToken(tokenizer);
        if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
          errms(tokenizer, "premature end of file");
        }
        do {
          if (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
            parseAttribute(tokenizer);
          } else if (Attribute.ARFF_END_SUBRELATION.equalsIgnoreCase(tokenizer.sval)) {
            getNextToken(tokenizer);
            if (!attributeName.equalsIgnoreCase(tokenizer.sval)) {
              errms(tokenizer, "declaration of subrelation " + attributeName +
                    " must be terminated by @end " + attributeName);
            }
            break;
          } else {
            errms(tokenizer, "declaration of subrelation " + attributeName +
                  " must be terminated by @end " + attributeName);
          }
        } while (true);

        // Make relation and restore original set of attributes
        Instances relation = new Instances(attributeName, m_Attributes, 0);
        m_Attributes = atts;
        m_Attributes.addElement(new Attribute(attributeName, relation,
                                              numAttributes()));
      } else {
        errms(tokenizer, "no valid attribute type or invalid enumeration");
      }
    } else {

      // Attribute is nominal.
      attributeValues = new FastVector();
      tokenizer.pushBack();

      // Get values for nominal attribute.
      if (tokenizer.nextToken() != '{') {
        errms(tokenizer, "{ expected at beginning of enumeration");
      }
      while (tokenizer.nextToken() != '}') {
        if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
          errms(tokenizer, "} expected at end of enumeration");
        } else {
          attributeValues.addElement(tokenizer.sval);
        }
      }
      if (attributeValues.size() == 0) {
        errms(tokenizer, "no nominal values found");
      }
      m_Attributes.addElement(new Attribute(attributeName, attributeValues,
                                            numAttributes()));
    }
    getLastToken(tokenizer, false);
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
  }

  /**
   * Copies instances from one set to the end of another one.
   *
   * @param from the position of the first instance to be copied
   * @param dest the destination for the instances
   * @param num the number of instances to be copied
   */
  //@ requires 0 <= from && from <= numInstances() - num;
  //@ requires 0 <= num;
  protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) {
    for (int i = 0; i < num; i++) {
      dest.add(instance(from + i));
    }
  }

  /**
   * Throws error message with line number and last token read.
   *
   * @param tokenizer the stream tokenizer
   * @param theMsg the error message to be thrown
   * @throws IOException containing the error message
   */
  protected void errms(StreamTokenizer tokenizer, String theMsg)
    throws IOException {

    throw new IOException(theMsg + ", read " + tokenizer.toString());
  }

  /**
   * Replaces the attribute information by a clone of itself.
   */
  protected void freshAttributeInfo() {
    m_Attributes = (FastVector) m_Attributes.copyElements();
  }

  /**
   * Gets next token, skipping empty lines.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if reading the next token fails
   */
  protected void getFirstToken(StreamTokenizer tokenizer) throws IOException {
    while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}
    if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }

  /**
   * Gets index, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if it finds a premature end of line
   */
  protected void getIndex(StreamTokenizer tokenizer) throws IOException {
    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
  }

  /**
   * Gets token and checks if it is the end of line.
   *
   * @param tokenizer the stream tokenizer
   * @param endOfFileOk whether an end of file is acceptable instead
   * @throws IOException if it doesn't find an end of line
   */
  protected void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
    throws IOException {

    if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
        ((tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
      errms(tokenizer, "end of line expected");
    }
  }

  /**
   * Gets next token, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @throws IOException if it finds a premature end of line
   */
  protected void getNextToken(StreamTokenizer tokenizer) throws IOException {
    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    } else if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }

  /**
   * Initializes the StreamTokenizer used for reading the ARFF file.
   *
   * @param tokenizer the stream tokenizer
   */
  protected void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();
    tokenizer.whitespaceChars(0, ' ');
    tokenizer.wordChars(' ' + 1, '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.commentChar('%');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    tokenizer.ordinaryChar('{');
    tokenizer.ordinaryChar('}');
    tokenizer.eolIsSignificant(true);
  }

  /**
   * Returns string including all instances, their weights and
   * their indices in the original dataset.
   *
   * @return description of instance and its weight as a string
   */
  protected /*@pure@*/ String instancesAndWeights() {
    StringBuffer text = new StringBuffer();

    for (int i = 0; i < numInstances(); i++) {
      text.append(instance(i) + " " + instance(i).weight());
      if (i < numInstances() - 1) {
        text.append("\n");
      }
    }
    return text.toString();
  }

  /**
   * Partitions the instances around a pivot. Used by quicksort and
   * kthSmallestValue.
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @param left the first index of the subset (index starts with 0)
   * @param right the last index of the subset (index starts with 0)
   *
   * @return the index of the middle element
   */
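
For context, a minimal usage sketch (not part of the original file) showing how these protected helpers are typically driven. It assumes the public weka.core.Instances(Reader) constructor from WEKA 3.4/3.5-era releases, which internally calls initTokenizer, readHeader, and then reads one instance per data row; "weather.arff" is a placeholder file name.

  import java.io.BufferedReader;
  import java.io.FileReader;
  import weka.core.Instances;

  public class ArffReadExample {
    public static void main(String[] args) throws Exception {
      // Placeholder path; any ARFF file works.
      BufferedReader reader = new BufferedReader(new FileReader("weather.arff"));
      // The Instances(Reader) constructor wraps the steps shown above:
      // initTokenizer(...), readHeader(...), then one instance per row.
      Instances data = new Instances(reader);
      reader.close();
      System.out.println("Relation:   " + data.relationName());
      System.out.println("Attributes: " + data.numAttributes());
      System.out.println("Instances:  " + data.numInstances());
    }
  }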
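
A second sketch exercising the declaration types handled by parseAttribute (numeric, nominal enumeration, string, and date with an explicit format). The ARFF text and attribute names are invented for illustration; it again assumes the Instances(Reader) constructor.

  import java.io.StringReader;
  import weka.core.Instances;

  public class ArffHeaderExample {
    public static void main(String[] args) throws Exception {
      String arff =
          "@relation weather\n"
        + "@attribute temperature numeric\n"
        + "@attribute outlook {sunny, overcast, rainy}\n"
        + "@attribute note string\n"
        + "@attribute when date \"yyyy-MM-dd\"\n"
        + "@data\n"
        + "72, sunny, 'clear morning', 2005-06-01\n";
      Instances data = new Instances(new StringReader(arff));
      System.out.println(data.attribute(1).isNominal());   // prints: true
      System.out.println(data.instance(0).stringValue(2)); // prints: clear morning
    }
  }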