📄 instances.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
    }
    if (numFolds > numInstances()) {
      throw new Exception("Can't have more folds than instances!");
    }
    numInstForFold = numInstances() / numFolds;
    if (numFold < numInstances() % numFolds) {
      numInstForFold++;
      offset = numFold;
    }else
      offset = numInstances() % numFolds;
    train = new Instances(this, numInstances() - numInstForFold);
    first = numFold * (numInstances() / numFolds) + offset;
    copyInstances(0, train, first);
    copyInstances(first + numInstForFold, train,
		  numInstances() - first - numInstForFold);

    return train;
  }

  /**
   * Computes the weighted sample variance of a numeric attribute.
   *
   * Every instance whose value for the attribute is not missing contributes
   * its value, weighted by the instance weight. The variance is obtained in
   * a single pass from the weighted sum and the weighted sum of squares.
   * If the total weight of the contributing instances is smaller than or
   * equal to 1 (as judged by <code>Utils.smOrEq</code>) the method returns 0.
   *
   * @param attIndex the index of the numeric attribute
   * @return the variance of the attribute; never negative
   * @exception Exception if the attribute is not numeric
   */
  public final double variance(int attIndex) throws Exception {

    if (!attribute(attIndex).isNumeric()) {
      throw new Exception("Can't compute variance because attribute is " +
                          "not numeric!");
    }

    double sum = 0, sumSquared = 0, sumOfWeights = 0;

    for (int i = 0; i < numInstances(); i++) {
      if (!instance(i).isMissing(attIndex)) {
        double weight = instance(i).weight();
        double value = instance(i).value(attIndex);
        sum += weight * value;
        sumSquared += weight * value * value;
        sumOfWeights += weight;
      }
    }

    // The denominator below is (sumOfWeights - 1), so a total weight of
    // at most 1 cannot yield a meaningful estimate.
    if (Utils.smOrEq(sumOfWeights, 1)) {
      return 0;
    }

    double result = (sumSquared - (sum * sum / sumOfWeights)) /
      (sumOfWeights - 1);

    // The two terms of the numerator are nearly equal for (almost) constant
    // data, so floating-point rounding can leave a tiny negative difference.
    // A variance is never negative; report 0 in that case.
    if (result < 0) {
      return 0;
    }
    return result;
  }

  /**
   * Computes the variance for a numeric attribute that is identified by
   * its attribute object instead of by its position.
   *
   * @param att the numeric attribute
   * @return the variance computed by {@link #variance(int)} for the
   * index reported by the attribute
   * @exception Exception if the attribute is not numeric
   */
  public final double variance(Attribute att) throws Exception {

    final int attIndex = att.index();
    return variance(attIndex);
  }

  /**
   * Calculates summary statistics on the values that appear in this
   * set of instances for a specified attribute.
   *
   * The instances are visited in the order produced by sorting the
   * attribute's values. Consecutive values that are equal according to
   * <code>Utils.eq</code> form one run, and each finished run is handed to
   * <code>AttributeStats.addDistinct</code> together with its length. The
   * walk stops at the first missing value; that instance and all instances
   * after it in sorted order are counted as missing.
   *
   * @param index the index of the attribute to summarize.
   * @return an AttributeStats object with its fields calculated.
   */
  public AttributeStats attributeStats(int index) {

    final int total = numInstances();

    AttributeStats stats = new AttributeStats();
    if (attribute(index).isNominal()) {
      stats.nominalCounts = new int [attribute(index).numValues()];
    }
    if (attribute(index).isNumeric()) {
      stats.numericStats = new org.agentacademy.modules.dataminer.experiment.Stats ();
    }
    stats.totalCount = total;

    // NOTE(review): the early exit below presumably relies on Utils.sort
    // placing missing values after all present values - confirm.
    int [] order = Utils.sort(attributeToDoubleArray(index));

    // The run starts out as an empty run of the "missing" marker; it is
    // reported like any other run and compensated for after the loop.
    double runValue = Instance.missingValue();
    int runLength = 0;

    int pos = 0;
    while (pos < total) {
      Instance current = instance(order[pos]);
      if (current.isMissing(index)) {
        stats.missingCount = total - pos;
        break;
      }
      double value = current.value(index);
      if (!Utils.eq(value, runValue)) {
        // A new value starts: report the finished run and open a new one.
        stats.addDistinct(runValue, runLength);
        runLength = 1;
        runValue = value;
      } else {
        runLength++;
      }
      pos++;
    }

    // Report the run that was still open when the walk ended.
    stats.addDistinct(runValue, runLength);
    stats.distinctCount--; // So we don't count "missing" as a value
    return stats;
  }

  /**
   * Collects the value of one attribute for every instance in this
   * dataset, in dataset order. Useful in conjunction with Utils.sort to
   * allow iterating through the dataset in sorted order for some attribute.
   *
   * @param index the index of the attribute.
   * @return a newly allocated array whose i-th element is the value of the
   * attribute for the i-th instance in the dataset.
   */
  public double [] attributeToDoubleArray(int index) {

    final int count = numInstances();
    double [] values = new double[count];
    int row = 0;
    while (row < count) {
      values[row] = instance(row).value(index);
      row++;
    }
    return values;
  }

  /**
   * Generates a string summarizing the set of instances. The string
   * starts with the relation name and the number of instances and
   * attributes, followed by a table with one row per attribute. Each row
   * shows the attribute's position (1-based), name and type, three
   * percentage columns (Nom / Int / Real), the missing and unique counts
   * with their percentages, and the distinct count.
   *
   * @return a string summarizing the dataset
   */
  public String toSummaryString() {

    StringBuffer summary = new StringBuffer();

    // Dataset header.
    summary.append("Relation Name:  ").append(relationName()).append('\n');
    summary.append("Num Instances:  ").append(numInstances()).append('\n');
    summary.append("Num Attributes: ").append(numAttributes()).append('\n');
    summary.append('\n');

    // Column titles.
    summary.append(Utils.padLeft("", 5));
    summary.append(Utils.padRight("Name", 25));
    summary.append(Utils.padLeft("Type", 5));
    summary.append(Utils.padLeft("Nom", 5));
    summary.append(Utils.padLeft("Int", 5));
    summary.append(Utils.padLeft("Real", 5));
    summary.append(Utils.padLeft("Missing", 12));
    summary.append(Utils.padLeft("Unique", 12));
    summary.append(Utils.padLeft("Dist", 6));
    summary.append('\n');

    // One row per attribute.
    for (int i = 0; i < numAttributes(); i++) {
      Attribute a = attribute(i);
      AttributeStats as = attributeStats(i);

      summary.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
      summary.append(Utils.padRight(a.name(), 25)).append(' ');

      // The type decides the label and which of the first two percentage
      // columns holds the fixed "0": the first one for numeric and unknown
      // types, the second one for nominal and string types.
      String typeLabel;
      boolean zeroFirst;
      switch (a.type()) {
      case Attribute.NOMINAL:
        typeLabel = "Nom";
        zeroFirst = false;
        break;
      case Attribute.NUMERIC:
        typeLabel = "Num";
        zeroFirst = true;
        break;
      case Attribute.STRING:
        typeLabel = "Str";
        zeroFirst = false;
        break;
      default:
        typeLabel = "???";
        zeroFirst = true;
        break;
      }

      long intPercent = Math.round(100.0 * as.intCount / as.totalCount);
      long realPercent = Math.round(100.0 * as.realCount / as.totalCount);

      summary.append(Utils.padLeft(typeLabel, 4)).append(' ');
      if (zeroFirst) {
        summary.append(Utils.padLeft("" + 0, 3)).append("% ");
        summary.append(Utils.padLeft("" + intPercent, 3)).append("% ");
      } else {
        summary.append(Utils.padLeft("" + intPercent, 3)).append("% ");
        summary.append(Utils.padLeft("" + 0, 3)).append("% ");
      }
      summary.append(Utils.padLeft("" + realPercent, 3)).append("% ");

      long missingPercent =
        Math.round(100.0 * as.missingCount / as.totalCount);
      summary.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
      summary.append(Utils.padLeft("" + missingPercent, 3)).append("% ");

      long uniquePercent =
        Math.round(100.0 * as.uniqueCount / as.totalCount);
      summary.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
      summary.append(Utils.padLeft("" + uniquePercent, 3)).append("% ");

      summary.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
      summary.append('\n');
    }
    return summary.toString();
  }

  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. The first token of the instance decides the format:
   * an opening curly brace selects the sparse reader, anything else the
   * full reader.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag if method should test for carriage return after
   * each instance
   * @return false if end of file has been reached, otherwise the result
   * of the sparse or full reader
   * @exception IOException if the information is not read
   * successfully
   */
  protected boolean getInstance(StreamTokenizer tokenizer,
                                boolean flag)
       throws IOException {

    // Without declared attributes there is nothing to parse against.
    // NOTE(review): errms presumably raises the IOException - its body is
    // not visible here.
    if (m_Attributes.size() == 0) {
      errms(tokenizer,"no header information available");
    }

    // Fetch the first token; end of file means there is no more instance.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      return false;
    }

    // Dispatch on the instance format.
    boolean sparse = (tokenizer.ttype == '{');
    if (sparse) {
      return getInstanceSparse(tokenizer, flag);
    }
    return getInstanceFull(tokenizer, flag);
  }

  /**
   * Reads a single instance in sparse format using the tokenizer and
   * appends it to the dataset. The instance is a sequence of
   * (attribute index, value) pairs terminated by a closing curly brace.
   * Indices must be strictly increasing and lie within the range of
   * declared attributes. The pairs are gathered in the index and value
   * buffers of this object and then copied into a new SparseInstance of
   * weight 1.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag if method should test for carriage return after
   * each instance
   * @return true once the instance has been added
   * @exception IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceSparse(StreamTokenizer tokenizer,
                                      boolean flag)
       throws IOException {

    int count = 0;       // number of (index, value) pairs read so far
    int lastIndex = -1;  // last accepted index, used for the order check

    // NOTE(review): errms presumably raises an IOException, so the code
    // after each call is only reached for valid input - confirm.
    for (;;) {

      // Read the attribute index, or the closing brace ending the instance.
      getIndex(tokenizer);
      if (tokenizer.ttype == '}') {
        break;
      }

      // The index token has to be an integer.
      try {
        m_IndicesBuffer[count] = Integer.parseInt(tokenizer.sval);
      } catch (NumberFormatException e) {
        errms(tokenizer,"index number expected");
      }
      int attIndex = m_IndicesBuffer[count];

      // Indices must be strictly increasing and within bounds.
      if (attIndex <= lastIndex) {
        errms(tokenizer,"indices have to be ordered");
      }
      if ((attIndex < 0) || (attIndex >= numAttributes())) {
        errms(tokenizer,"index out of bounds");
      }
      lastIndex = attIndex;

      // Read the value that belongs to the index.
      getNextToken(tokenizer);

      if (tokenizer.ttype == '?') {
        // A question mark denotes a missing value.
        m_ValueBuffer[count] = Instance.missingValue();
      } else {
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          errms(tokenizer,"not a valid value");
        }
        if (attribute(attIndex).isNominal()) {

          // Nominal values are stored as their position in the header.
          int valIndex = attribute(attIndex).indexOfValue(tokenizer.sval);
          if (valIndex == -1) {
            errms(tokenizer,"nominal value not declared in header");
          }
          m_ValueBuffer[count] = (double)valIndex;
        } else if (attribute(attIndex).isNumeric()) {

          // Numeric values have to parse as a number.
          try {
            m_ValueBuffer[count] = Double.parseDouble(tokenizer.sval);
          } catch (NumberFormatException e) {
            errms(tokenizer,"number expected");
          }
        } else {

          // Any other attribute type: register the text with the attribute
          // and store what addStringValue returns.
          m_ValueBuffer[count] =
            attribute(attIndex).addStringValue(tokenizer.sval);
        }
      }
      count++;
    }
    if (flag) {
      getLastToken(tokenizer,true);
    }

    // Copy the used part of the buffers and add the instance to the dataset.
    double[] values = new double[count];
    int[] indices = new int[count];
    System.arraycopy(m_ValueBuffer, 0, values, 0, count);
    System.arraycopy(m_IndicesBuffer, 0, indices, 0, count);
    add(new SparseInstance(1, values, indices, numAttributes()));
    return true;
  }
  /**
   * Builds a single instance from an XML tuple and appends it to the
   * dataset. Each <code>ATTRVALUE</code> child of the tuple names an
   * attribute through its <code>name</code> XML attribute and carries the
   * value as its text.
   *
   * Parsing is best-effort: problems are logged and do not abort the
   * tuple.
   * <ul>
   * <li>An ATTRVALUE naming an attribute that is not declared in the
   * header is logged and skipped.</li>
   * <li>The text "?" is stored as a missing value.</li>
   * <li>A nominal value that is not declared in the header, or a numeric
   * value that does not parse as a number, is logged and stored as a
   * missing value, so that no invalid or fabricated value enters the
   * dataset.</li>
   * </ul>
   * NOTE(review): attributes for which the tuple supplies no ATTRVALUE
   * child keep the array default of 0.0 - confirm this is intended.
   *
   * @param tuple the node to be searched
   * @return true once the instance has been added
   * @exception IOException declared for callers; no statement in this
   * method raises it directly
   * by asymeon
   */
  protected boolean getInstance(Element tuple) throws IOException {

      double[] instance = new double[numAttributes()];

      // Walk over the ATTRVALUEs of this TUPLE.
      List tupleContentList = tuple.getChildren("ATTRVALUE");
      Iterator attrValueIterator = tupleContentList.iterator();
      while (attrValueIterator.hasNext()) {
        Element attrValueElement = (Element) attrValueIterator.next();
        String attrValueName = attrValueElement.getAttributeValue("name");
        org.agentacademy.modules.dataminer.core.Attribute coreAttr = attribute(attrValueName);
        if (coreAttr == null) {
            log.error("An error has occured: The " + attrValueName + " attribute has not been declared in header");
            continue;
        }

        int i = coreAttr.index();
        String attrValueText = attrValueElement.getText();

        if (attrValueText.equalsIgnoreCase("?")) {
            instance[i] = Instance.missingValue();
        }
        else if (coreAttr.isNominal()) {
            int index = attribute(i).indexOfValue(attrValueText);
            if (index == -1) {
                log.error("Error occured: The " + attrValueText + " nominal value is not declared in header");
                // -1 is not a valid nominal value; record the value as
                // missing instead of storing an invalid index.
                instance[i] = Instance.missingValue();
            }
            else {
                instance[i] = (double)index;
            }
        }
        else if (coreAttr.isNumeric()) {
            // Check if value is really a number.
            try {
                instance[i] = Double.valueOf(attrValueText).doubleValue();
            }
            catch (NumberFormatException e) {
                log.error("Error occured: For attribute " + attrValueName + " number expected, found " + attrValueText);
                // Do not keep the default 0.0 as if it had been read.
                instance[i] = Instance.missingValue();
            }
        }
        else {
            instance[i] = coreAttr.addStringValue(attrValueText);
        }
      } //end of while

      add(new Instance (1 , instance));
      return true;
    }

  /**
   * Reads a single instance using the tokenizer and appends it
   * to the dataset. Automatically expands the dataset if it
   * is not large enough to hold the instance.
   *
   * @param tokenizer the tokenizer to be used
   * @param flag if method should test for carriage return after
   * each instance
   * @return false if end of file has been reached
   * @exception IOException if the information is not read
   * successfully
   */
  protected boolean getInstanceFull(StreamTokenizer tokenizer,
				    boolean flag)
       throws IOException {

    double[] instance = new double[numAttributes()];
    int index;

    // Get values for all attributes.
    for (int i = 0; i < numAttributes(); i++){

      // Get next token
      if (i > 0) {
	getNextToken(tokenizer);
      }

      // Check if value is missing.
      if  (tokenizer.ttype == '?') {
	instance[i] = Instance.missingValue();
      } else {
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -