// instances.java
// NOTE(review): the two lines here were code-viewer website residue
// ("font size" UI text), not part of the source; replaced with this header.
/**
 * Convenience overload: computes the variance of the given numeric
 * attribute by delegating to the index-based variance method.
 *
 * @param att the numeric attribute whose variance is wanted
 * @return the variance of that attribute's values
 * @exception IllegalArgumentException if the attribute is not numeric
 */
public final double variance(Attribute att) {

  return variance(att.index());
}
/**
 * Calculates summary statistics on the values that appear in this
 * set of instances for a specified attribute.
 *
 * @param index the index of the attribute to summarize.
 * @return an AttributeStats object with it's fields calculated.
 */
public AttributeStats attributeStats(int index) {

  AttributeStats result = new AttributeStats();
  // Allocate per-type storage up front: nominal attributes get a
  // label-frequency table, numeric attributes get running stats.
  if (attribute(index).isNominal()) {
    result.nominalCounts = new int [attribute(index).numValues()];
  }
  if (attribute(index).isNumeric()) {
    result.numericStats = new weka.experiment.Stats();
  }
  result.totalCount = numInstances();

  // Visit the values in sorted order so equal values are adjacent and
  // distinct values can be counted with a single running group.
  double [] attVals = attributeToDoubleArray(index);
  int [] sorted = Utils.sort(attVals);
  int currentCount = 0;
  // Sentinel: missingValue() is NaN, which never compares equal to any
  // value, so the first element always starts a fresh group.
  double prev = Instance.missingValue();
  for (int j = 0; j < numInstances(); j++) {
    Instance current = instance(sorted[j]);
    if (current.isMissing(index)) {
      // NOTE(review): assumes Utils.sort places missing values at the
      // end of the sort order, so everything from j onward is missing
      // -- TODO confirm against Utils.sort.
      result.missingCount = numInstances() - j;
      break;
    }
    if (current.value(index) == prev) {
      currentCount++;
    } else {
      // Value changed: flush the finished group, start a new one.
      result.addDistinct(prev, currentCount);
      currentCount = 1;
      prev = current.value(index);
    }
  }
  // Flush the final group (the loop only flushes on a value change).
  // Note the very first flush above was addDistinct(NaN, 0) for the
  // sentinel; the decrement below compensates for that extra entry.
  result.addDistinct(prev, currentCount);
  result.distinctCount--; // So we don't count "missing" as a value
  return result;
}
/**
 * Collects the value of every instance in this dataset for one
 * attribute into a flat array. Handy together with Utils.sort for
 * iterating the dataset in sorted order of some attribute.
 *
 * @param index the index of the attribute.
 * @return an array holding, for each instance, its value of the
 * desired attribute.
 */
public double [] attributeToDoubleArray(int index) {

  int numValues = numInstances();
  double [] values = new double[numValues];
  for (int i = 0; i < numValues; i++) {
    values[i] = instance(i).value(index);
  }
  return values;
}
/**
 * Generates a string summarizing the set of instances. Gives a breakdown
 * for each attribute indicating the number of missing/discrete/unique
 * values and other information.
 *
 * @return a string summarizing the dataset
 */
public String toSummaryString() {

  StringBuffer result = new StringBuffer();
  result.append("Relation Name: ").append(relationName()).append('\n');
  result.append("Num Instances: ").append(numInstances()).append('\n');
  result.append("Num Attributes: ").append(numAttributes()).append('\n');
  result.append('\n');

  // Column headers for the per-attribute table below.
  result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
  result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
  result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
  result.append(Utils.padLeft("Missing", 12));
  result.append(Utils.padLeft("Unique", 12));
  result.append(Utils.padLeft("Dist", 6)).append('\n');

  for (int i = 0; i < numAttributes(); i++) {
    Attribute a = attribute(i);
    AttributeStats as = attributeStats(i);
    result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
    result.append(Utils.padRight(a.name(), 25)).append(' ');

    // Type label plus the three percentage columns. Nominal and string
    // attributes put the integer-valued percentage in the first column;
    // all other types put a literal 0% first.
    switch (a.type()) {
    case Attribute.NOMINAL:
      appendTypePercentages(result, "Nom", as, true);
      break;
    case Attribute.NUMERIC:
      appendTypePercentages(result, "Num", as, false);
      break;
    case Attribute.DATE:
      appendTypePercentages(result, "Dat", as, false);
      break;
    case Attribute.STRING:
      appendTypePercentages(result, "Str", as, true);
      break;
    default:
      appendTypePercentages(result, "???", as, false);
      break;
    }

    long percent;
    result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
    percent = Math.round(100.0 * as.missingCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
    percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
    result.append('\n');
  }
  return result.toString();
}

/**
 * Appends the type label and the three percentage columns
 * (Nom / Int / Real) for one attribute to the summary, preserving the
 * exact column layout of the summary table.
 *
 * @param result the buffer being built by toSummaryString
 * @param label the short type label ("Nom", "Num", "Dat", "Str", "???")
 * @param as the precomputed statistics for the attribute
 * @param intFirst true to emit intCount%, 0%, realCount% (nominal and
 * string layout); false to emit 0%, intCount%, realCount%
 */
private void appendTypePercentages(StringBuffer result, String label,
                                   AttributeStats as, boolean intFirst) {

  result.append(Utils.padLeft(label, 4)).append(' ');
  long percent;
  if (intFirst) {
    percent = Math.round(100.0 * as.intCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + 0, 3)).append("% ");
  } else {
    result.append(Utils.padLeft("" + 0, 3)).append("% ");
    percent = Math.round(100.0 * as.intCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
  }
  percent = Math.round(100.0 * as.realCount / as.totalCount);
  result.append(Utils.padLeft("" + percent, 3)).append("% ");
}
/**
 * Reads a single instance using the tokenizer and appends it to the
 * dataset, dispatching on the first token to either the sparse
 * ("{index value, ...}") or the full (dense) instance reader.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstance(StreamTokenizer tokenizer,
                              boolean flag)
     throws IOException {

  // A data line is meaningless without attribute declarations.
  if (m_Attributes.size() == 0) {
    errms(tokenizer, "no header information available");
  }

  // Nothing left to read?
  getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return false;
  }

  // A leading '{' introduces the sparse ARFF instance format.
  boolean sparse = (tokenizer.ttype == '{');
  return sparse
    ? getInstanceSparse(tokenizer, flag)
    : getInstanceFull(tokenizer, flag);
}
/**
 * Reads a single instance in sparse format ("{index value, ...}") using
 * the tokenizer and appends it to the dataset; the opening '{' has
 * already been consumed by the caller. Automatically expands the dataset
 * if it is not large enough to hold the instance.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstanceSparse(StreamTokenizer tokenizer,
                                    boolean flag)
     throws IOException {

  int valIndex, numValues = 0, maxIndex = -1;

  // Accumulate index/value pairs into the shared scratch buffers
  // (m_IndicesBuffer / m_ValueBuffer) until the closing '}' is seen.
  do {

    // Get index
    getIndex(tokenizer);
    if (tokenizer.ttype == '}') {
      break;
    }

    // Is index valid?
    try{
      m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
    } catch (NumberFormatException e) {
      errms(tokenizer,"index number expected");
    }
    // Indices must be strictly increasing; this also rejects duplicates.
    if (m_IndicesBuffer[numValues] <= maxIndex) {
      errms(tokenizer,"indices have to be ordered");
    }
    if ((m_IndicesBuffer[numValues] < 0) ||
        (m_IndicesBuffer[numValues] >= numAttributes())) {
      errms(tokenizer,"index out of bounds");
    }
    maxIndex = m_IndicesBuffer[numValues];

    // Get value;
    getNextToken(tokenizer);

    // Check if value is missing.
    if (tokenizer.ttype == '?') {
      m_ValueBuffer[numValues] = Instance.missingValue();
    } else {

      // Check if token is valid.
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        errms(tokenizer,"not a valid value");
      }
      // Convert the token to the internal double encoding according to
      // the declared type of the attribute at this index.
      switch (attribute(m_IndicesBuffer[numValues]).type()) {
      case Attribute.NOMINAL:
        // Check if value appears in header.
        valIndex =
          attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
        if (valIndex == -1) {
          errms(tokenizer,"nominal value not declared in header");
        }
        // Nominal values are stored as the index of their label.
        m_ValueBuffer[numValues] = (double)valIndex;
        break;
      case Attribute.NUMERIC:
        // Check if value is really a number.
        try{
          m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).
            doubleValue();
        } catch (NumberFormatException e) {
          errms(tokenizer,"number expected");
        }
        break;
      case Attribute.STRING:
        // String values are registered with the attribute; the returned
        // number identifies the stored string.
        m_ValueBuffer[numValues] =
          attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
        break;
      case Attribute.DATE:
        try {
          m_ValueBuffer[numValues] =
            attribute(m_IndicesBuffer[numValues]).parseDate(tokenizer.sval);
        } catch (ParseException e) {
          errms(tokenizer,"unparseable date: " + tokenizer.sval);
        }
        break;
      default:
        errms(tokenizer,"unknown attribute type in column " + m_IndicesBuffer[numValues]);
      }
    }
    numValues++;
  } while (true);
  if (flag) {
    // Consume the end-of-line marker following the instance.
    getLastToken(tokenizer,true);
  }

  // Add instance to dataset: copy the scratch buffers into right-sized
  // arrays first, since the buffers are reused across calls.
  // NOTE(review): the leading 1 is presumably the instance weight --
  // confirm against the SparseInstance constructor.
  double[] tempValues = new double[numValues];
  int[] tempIndices = new int[numValues];
  System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
  System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
  add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
  return true;
}
/**
 * Reads a single instance in full (dense) format -- one value per
 * declared attribute -- using the tokenizer and appends it to the
 * dataset. Automatically expands the dataset if it is not large enough
 * to hold the instance.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstanceFull(StreamTokenizer tokenizer,
                                  boolean flag)
     throws IOException {

  double[] instance = new double[numAttributes()];
  int index;

  // Get values for all attributes.
  for (int i = 0; i < numAttributes(); i++){

    // Get next token; the first value's token was already read by the
    // caller when it dispatched on the line format.
    if (i > 0) {
      getNextToken(tokenizer);
    }

    // Check if value is missing.
    if (tokenizer.ttype == '?') {
      instance[i] = Instance.missingValue();
    } else {

      // Check if token is valid.
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        errms(tokenizer,"not a valid value");
      }
      // Convert the token to the internal double encoding according to
      // the declared type of attribute i.
      switch (attribute(i).type()) {
      case Attribute.NOMINAL:
        // Check if value appears in header.
        index = attribute(i).indexOfValue(tokenizer.sval);
        if (index == -1) {
          errms(tokenizer,"nominal value not declared in header");
        }
        // Nominal values are stored as the index of their label.
        instance[i] = (double)index;
        break;
      case Attribute.NUMERIC:
        // Check if value is really a number.
        try{
          instance[i] = Double.valueOf(tokenizer.sval).
            doubleValue();
        } catch (NumberFormatException e) {
          errms(tokenizer,"number expected");
        }
        break;
      case Attribute.STRING:
        // String values are registered with the attribute; the returned
        // number identifies the stored string.
        instance[i] = attribute(i).addStringValue(tokenizer.sval);
        break;
      case Attribute.DATE:
        try {
          instance[i] = attribute(i).parseDate(tokenizer.sval);
        } catch (ParseException e) {
          errms(tokenizer,"unparseable date: " + tokenizer.sval);
        }
        break;
      default:
        errms(tokenizer,"unknown attribute type in column " + i);
      }
    }
  }
  if (flag) {
    // Consume the end-of-line marker following the instance.
    getLastToken(tokenizer,true);
  }

  // Add instance to dataset.
  // NOTE(review): the leading 1 is presumably the instance weight --
  // confirm against the Instance constructor.
  add(new Instance(1, instance));
  return true;
}
/**
* Reads and stores header of an ARFF file.
*
* @param tokenizer the stream tokenizer
* @exception IOException if the information is not read
* successfully
*/
protected void readHeader(StreamTokenizer tokenizer)
throws IOException {
String attributeName;
FastVector attributeValues;
// Get name of relation.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
errms(tokenizer,"premature end of file");
}
if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
getNextToken(tokenizer);
m_RelationName = tokenizer.sval;
getLastToken(tokenizer,false);
} else {
errms(tokenizer,"keyword " + ARFF_RELATION + " expected");
}
// Create vectors to hold information temporarily.
m_Attributes = new FastVector();
// Get attribute declarations.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
errms(tokenizer,"premature end of file");
}
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
// Get attribute name.
getNextToken(tokenizer);
attributeName = tokenizer.sval;
getNextToken(tokenizer);
// Check if attribute is nominal.
if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, or string.
if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
m_Attributes.
addElement(new Attribute(attributeName, (FastVector)null,
numAttributes()));
readTillEOL(tokenizer);
// NOTE(review): the remainder of readHeader (and of the file) was lost
// in extraction; the lines here were code-viewer keyboard-shortcut UI
// text, not source code. Recover the rest of this method from the
// original Instances.java before compiling.