📄 instances.java

📁 Java 编写的多种数据挖掘算法包括聚类、分类、预处理等
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
      if (i < numInstances() - 1) {	text.append('\n');      }    }    return text.toString();  }  /**   * Creates the training set for one fold of a cross-validation    * on the dataset.    *   * @param numFolds the number of folds in the cross-validation. Must   * be greater than 1.   * @param numFold 0 for the first fold, 1 for the second, ...   * @return the training set    * @throws IllegalArgumentException if the number of folds is less than 2   * or greater than the number of instances.   */  //@ requires 2 <= numFolds && numFolds < numInstances();  //@ requires 0 <= numFold && numFold < numFolds;  public Instances trainCV(int numFolds, int numFold) {    int numInstForFold, first, offset;    Instances train;     if (numFolds < 2) {      throw new IllegalArgumentException("Number of folds must be at least 2!");    }    if (numFolds > numInstances()) {      throw new IllegalArgumentException("Can't have more folds than instances!");    }    numInstForFold = numInstances() / numFolds;    if (numFold < numInstances() % numFolds) {      numInstForFold++;      offset = numFold;    }else      offset = numInstances() % numFolds;    train = new Instances(this, numInstances() - numInstForFold);    first = numFold * (numInstances() / numFolds) + offset;    copyInstances(0, train, first);    copyInstances(first + numInstForFold, train,		  numInstances() - first - numInstForFold);    return train;  }  /**   * Creates the training set for one fold of a cross-validation    * on the dataset. The data is subsequently randomized based   * on the given random number generator.   *   * @param numFolds the number of folds in the cross-validation. Must   * be greater than 1.   * @param numFold 0 for the first fold, 1 for the second, ...   * @param random the random number generator   * @return the training set    * @throws IllegalArgumentException if the number of folds is less than 2   * or greater than the number of instances.   */  //@ requires 2 <= numFolds && numFolds < numInstances();  //@ requires 0 <= numFold && numFold < numFolds;  public Instances trainCV(int numFolds, int numFold, Random random) {    Instances train = trainCV(numFolds, numFold);    train.randomize(random);    return train;  }  /**   * Computes the variance for a numeric attribute.   *   * @param attIndex the numeric attribute (index starts with 0)   * @return the variance if the attribute is numeric   * @throws IllegalArgumentException if the attribute is not numeric   */  public /*@pure@*/ double variance(int attIndex) {      double sum = 0, sumSquared = 0, sumOfWeights = 0;    if (!attribute(attIndex).isNumeric()) {      throw new IllegalArgumentException("Can't compute variance because attribute is " +			  "not numeric!");    }    for (int i = 0; i < numInstances(); i++) {      if (!instance(i).isMissing(attIndex)) {	sum += instance(i).weight() * 	  instance(i).value(attIndex);	sumSquared += instance(i).weight() * 	  instance(i).value(attIndex) *	  instance(i).value(attIndex);	sumOfWeights += instance(i).weight();      }    }    if (sumOfWeights <= 1) {      return 0;    }    double result = (sumSquared - (sum * sum / sumOfWeights)) /       (sumOfWeights - 1);    // We don't like negative variance    if (result < 0) {      return 0;    } else {      return result;    }  }  /**   * Computes the variance for a numeric attribute.   *   * @param att the numeric attribute   * @return the variance if the attribute is numeric   * @throws IllegalArgumentException if the attribute is not numeric   */  public /*@pure@*/ double variance(Attribute att) {        return variance(att.index());  }    /**   * Calculates summary statistics on the values that appear in this   * set of instances for a specified attribute.   *   * @param index the index of the attribute to summarize (index starts with 0)   * @return an AttributeStats object with it's fields calculated.   */  //@ requires 0 <= index && index < numAttributes();  public AttributeStats attributeStats(int index) {    AttributeStats result = new AttributeStats();    if (attribute(index).isNominal()) {      result.nominalCounts = new int [attribute(index).numValues()];    }    if (attribute(index).isNumeric()) {      result.numericStats = new weka.experiment.Stats();    }    result.totalCount = numInstances();    double [] attVals = attributeToDoubleArray(index);    int [] sorted = Utils.sort(attVals);    int currentCount = 0;    double prev = Instance.missingValue();    for (int j = 0; j < numInstances(); j++) {      Instance current = instance(sorted[j]);      if (current.isMissing(index)) {	result.missingCount = numInstances() - j;	break;      }      if (current.value(index) == prev) {	currentCount++;      } else {	result.addDistinct(prev, currentCount);	currentCount = 1;	prev = current.value(index);      }    }    result.addDistinct(prev, currentCount);    result.distinctCount--; // So we don't count "missing" as a value     return result;  }    /**   * Gets the value of all instances in this dataset for a particular   * attribute. Useful in conjunction with Utils.sort to allow iterating   * through the dataset in sorted order for some attribute.   *   * @param index the index of the attribute.   * @return an array containing the value of the desired attribute for   * each instance in the dataset.    */  //@ requires 0 <= index && index < numAttributes();  public /*@pure@*/ double [] attributeToDoubleArray(int index) {    double [] result = new double[numInstances()];    for (int i = 0; i < result.length; i++) {      result[i] = instance(i).value(index);    }    return result;  }  /**   * Generates a string summarizing the set of instances. Gives a breakdown   * for each attribute indicating the number of missing/discrete/unique   * values and other information.   *   * @return a string summarizing the dataset   */  public String toSummaryString() {    StringBuffer result = new StringBuffer();    result.append("Relation Name:  ").append(relationName()).append('\n');    result.append("Num Instances:  ").append(numInstances()).append('\n');    result.append("Num Attributes: ").append(numAttributes()).append('\n');    result.append('\n');    result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));    result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));    result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));    result.append(Utils.padLeft("Missing", 12));    result.append(Utils.padLeft("Unique", 12));    result.append(Utils.padLeft("Dist", 6)).append('\n');    for (int i = 0; i < numAttributes(); i++) {      Attribute a = attribute(i);      AttributeStats as = attributeStats(i);      result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');      result.append(Utils.padRight(a.name(), 25)).append(' ');      long percent;      switch (a.type()) {      case Attribute.NOMINAL:	result.append(Utils.padLeft("Nom", 4)).append(' ');	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      case Attribute.NUMERIC:	result.append(Utils.padLeft("Num", 4)).append(' ');	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      case Attribute.DATE:	result.append(Utils.padLeft("Dat", 4)).append(' ');	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      case Attribute.STRING:	result.append(Utils.padLeft("Str", 4)).append(' ');	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      case Attribute.RELATIONAL:	result.append(Utils.padLeft("Rel", 4)).append(' ');	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      default:	result.append(Utils.padLeft("???", 4)).append(' ');	result.append(Utils.padLeft("" + 0, 3)).append("% ");	percent = Math.round(100.0 * as.intCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	percent = Math.round(100.0 * as.realCount / as.totalCount);	result.append(Utils.padLeft("" + percent, 3)).append("% ");	break;      }      result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");      percent = Math.round(100.0 * as.missingCount / as.totalCount);      result.append(Utils.padLeft("" + percent, 3)).append("% ");      result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");      percent = Math.round(100.0 * as.uniqueCount / as.totalCount);      result.append(Utils.padLeft("" + percent, 3)).append("% ");      result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');      result.append('\n');    }    return result.toString();  }    /**   * Reads a single instance using the tokenizer and appends it   * to the dataset. Automatically expands the dataset if it   * is not large enough to hold the instance.   *   * @param tokenizer the tokenizer to be used   * @param flag if method should test for carriage return after    * each instance   * @return false if end of file has been reached   * @throws IOException if the information is not read    * successfully   */   protected boolean getInstance(StreamTokenizer tokenizer, 				boolean flag)        throws IOException {        // Check if any attributes have been declared.    if (m_Attributes.size() == 0) {      errms(tokenizer,"no header information available");    }    // Check if end of file reached.    getFirstToken(tokenizer);    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {      return false;    }        // Parse instance    if (tokenizer.ttype == '{') {      return getInstanceSparse(tokenizer, flag);    } else {      return getInstanceFull(tokenizer, flag);    }  }  /**   * Reads a single instance using the tokenizer and appends it   * to the dataset. Automatically expands the dataset if it   * is not large enough to hold the instance.   *   * @param tokenizer the tokenizer to be used   * @param flag if method should test for carriage return after    * each instance   * @return false if end of file has been reached   * @throws IOException if the information is not read    * successfully   */   protected boolean getInstanceSparse(StreamTokenizer tokenizer, 				      boolean flag)        throws IOException {    int valIndex, numValues = 0, maxIndex = -1;        // Get values    do {            // Get index      getIndex(tokenizer);      if (tokenizer.ttype == '}') {	break;      }       // Is index valid?      try{	m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();      } catch (NumberFormatException e) {	errms(tokenizer,"index number expected");      }      if (m_IndicesBuffer[numValues] <= maxIndex) {	errms(tokenizer,"indices have to be ordered");      }      if ((m_IndicesBuffer[numValues] < 0) || 	  (m_IndicesBuffer[numValues] >= numAttributes())) {	errms(tokenizer,"index out of bounds");      }      maxIndex = m_IndicesBuffer[numValues];      // Get value;      getNextToken(tokenizer);      // Check if value is missing.      if  (tokenizer.ttype == '?') {	m_ValueBuffer[numValues] = Instance.missingValue();      } else {	// Check if token is valid.	if (tokenizer.ttype != StreamTokenizer.TT_WORD) {	  errms(tokenizer,"not a valid value");	}        switch (attribute(m_IndicesBuffer[numValues]).type()) {          case Attribute.NOMINAL:            // Check if value appears in header.            valIndex =               attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);            if (valIndex == -1) {              errms(tokenizer,"nominal value not declared in header");            }            m_ValueBuffer[numValues] = (double)valIndex;            break;	case Attribute.NUMERIC:	  // Check if value is really a number.	  try{	    m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).	      doubleValue();	  } catch (NumberFormatException e) {	    errms(tokenizer,"number expected");	  }          break;	case Attribute.STRING:	  m_ValueBuffer[numValues] = 	    attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);          break;        case Attribute.DATE:          try {            m_ValueBuffer[numValues] =               attribute(m_IndicesBuffer[numValues]).parseDate(tokenizer.sval);          } catch (ParseException e) {            errms(tokenizer,"unparseable date: " + tokenizer.sval);          }          break;        case Attribute.RELATIONAL:          StringReader reader = new StringReader(tokenizer.sval);          StreamTokenizer innerTokenizer = new StreamTokenizer(reader);          initTokenizer(innerTokenizer);          Instances data = new Instances(attribute(m_IndicesBuffer[numValues]).relation(), 100);          // Allocate buffers in case sparse instances have to be read          data.m_ValueBuffer = new double[data.numAttributes()];          data.m_IndicesBuffer = new int[data.numAttributes()];          while (data.getInstance(innerTokenizer, true)) {};          data.compactify();          m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addRelation(data);          break;        default:          errms(tokenizer,"unknown attribute type in column " + m_IndicesBuffer[numValues]);	}      }      numValues++;    } while (true);    if (flag) {      getLastToken(tokenizer,true);    }          // Add instance to dataset    double[] tempValues = new double[numValues];    int[] tempIndices = new int[numValues];    System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);    System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);    add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));    return true;  }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -