📄 instances.java
字号:
offset = numInstances() % numFolds; test = new Instances(this, numInstForFold); first = numFold * (numInstances() / numFolds) + offset; copyInstances(first, test, numInstForFold); return test; } /** * Returns the dataset as a string in ARFF format. Strings * are quoted if they contain whitespace characters, or if they * are a question mark. * * @return the dataset in ARFF format as a string */ public String toString() { StringBuffer text = new StringBuffer(); text.append(ARFF_RELATION).append(" "). append(Utils.quote(m_RelationName)).append("\n\n"); for (int i = 0; i < numAttributes(); i++) { text.append(attribute(i)).append("\n"); } text.append("\n").append(ARFF_DATA).append("\n"); text.append(stringWithoutHeader()); return text.toString(); } /** * Returns the instances in the dataset as a string in ARFF format. Strings * are quoted if they contain whitespace characters, or if they * are a question mark. * * @return the dataset in ARFF format as a string */ protected String stringWithoutHeader() { StringBuffer text = new StringBuffer(); for (int i = 0; i < numInstances(); i++) { text.append(instance(i)); if (i < numInstances() - 1) { text.append('\n'); } } return text.toString(); } /** * Creates the training set for one fold of a cross-validation * on the dataset. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. 
*/ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold) { int numInstForFold, first, offset; Instances train; if (numFolds < 2) { throw new IllegalArgumentException("Number of folds must be at least 2!"); } if (numFolds > numInstances()) { throw new IllegalArgumentException("Can't have more folds than instances!"); } numInstForFold = numInstances() / numFolds; if (numFold < numInstances() % numFolds) { numInstForFold++; offset = numFold; }else offset = numInstances() % numFolds; train = new Instances(this, numInstances() - numInstForFold); first = numFold * (numInstances() / numFolds) + offset; copyInstances(0, train, first); copyInstances(first + numInstForFold, train, numInstances() - first - numInstForFold); return train; } /** * Creates the training set for one fold of a cross-validation * on the dataset. The data is subsequently randomized based * on the given random number generator. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @param random the random number generator * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold, Random random) { Instances train = trainCV(numFolds, numFold); train.randomize(random); return train; } /** * Computes the variance for a numeric attribute. 
* * @param attIndex the numeric attribute (index starts with 0) * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(int attIndex) { double sum = 0, sumSquared = 0, sumOfWeights = 0; if (!attribute(attIndex).isNumeric()) { throw new IllegalArgumentException("Can't compute variance because attribute is " + "not numeric!"); } for (int i = 0; i < numInstances(); i++) { if (!instance(i).isMissing(attIndex)) { sum += instance(i).weight() * instance(i).value(attIndex); sumSquared += instance(i).weight() * instance(i).value(attIndex) * instance(i).value(attIndex); sumOfWeights += instance(i).weight(); } } if (sumOfWeights <= 1) { return 0; } double result = (sumSquared - (sum * sum / sumOfWeights)) / (sumOfWeights - 1); // We don't like negative variance if (result < 0) { return 0; } else { return result; } } /** * Computes the variance for a numeric attribute. * * @param att the numeric attribute * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(Attribute att) { return variance(att.index()); } /** * Calculates summary statistics on the values that appear in this * set of instances for a specified attribute. * * @param index the index of the attribute to summarize (index starts with 0) * @return an AttributeStats object with it's fields calculated. 
*/ //@ requires 0 <= index && index < numAttributes(); public AttributeStats attributeStats(int index) { AttributeStats result = new AttributeStats(); if (attribute(index).isNominal()) { result.nominalCounts = new int [attribute(index).numValues()]; } if (attribute(index).isNumeric()) { result.numericStats = new weka.experiment.Stats(); } result.totalCount = numInstances(); double [] attVals = attributeToDoubleArray(index); int [] sorted = Utils.sort(attVals); int currentCount = 0; double prev = Instance.missingValue(); for (int j = 0; j < numInstances(); j++) { Instance current = instance(sorted[j]); if (current.isMissing(index)) { result.missingCount = numInstances() - j; break; } if (current.value(index) == prev) { currentCount++; } else { result.addDistinct(prev, currentCount); currentCount = 1; prev = current.value(index); } } result.addDistinct(prev, currentCount); result.distinctCount--; // So we don't count "missing" as a value return result; } /** * Gets the value of all instances in this dataset for a particular * attribute. Useful in conjunction with Utils.sort to allow iterating * through the dataset in sorted order for some attribute. * * @param index the index of the attribute. * @return an array containing the value of the desired attribute for * each instance in the dataset. */ //@ requires 0 <= index && index < numAttributes(); public /*@pure@*/ double [] attributeToDoubleArray(int index) { double [] result = new double[numInstances()]; for (int i = 0; i < result.length; i++) { result[i] = instance(i).value(index); } return result; } /** * Generates a string summarizing the set of instances. Gives a breakdown * for each attribute indicating the number of missing/discrete/unique * values and other information. 
* * @return a string summarizing the dataset */ public String toSummaryString() { StringBuffer result = new StringBuffer(); result.append("Relation Name: ").append(relationName()).append('\n'); result.append("Num Instances: ").append(numInstances()).append('\n'); result.append("Num Attributes: ").append(numAttributes()).append('\n'); result.append('\n'); result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25)); result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5)); result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5)); result.append(Utils.padLeft("Missing", 12)); result.append(Utils.padLeft("Unique", 12)); result.append(Utils.padLeft("Dist", 6)).append('\n'); for (int i = 0; i < numAttributes(); i++) { Attribute a = attribute(i); AttributeStats as = attributeStats(i); result.append(Utils.padLeft("" + (i + 1), 4)).append(' '); result.append(Utils.padRight(a.name(), 25)).append(' '); long percent; switch (a.type()) { case Attribute.NOMINAL: result.append(Utils.padLeft("Nom", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.NUMERIC: result.append(Utils.padLeft("Num", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.DATE: result.append(Utils.padLeft("Dat", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = 
Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.STRING: result.append(Utils.padLeft("Str", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.RELATIONAL: result.append(Utils.padLeft("Rel", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; default: result.append(Utils.padLeft("???", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; } result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /"); percent = Math.round(100.0 * as.missingCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /"); percent = Math.round(100.0 * as.uniqueCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' '); result.append('\n'); } return result.toString(); } /** * Copies instances from one set to the end of another * one. 
* * @param from the position of the first instance to be copied * @param dest the destination for the instances * @param num the number of instances to be copied */ //@ requires 0 <= from && from <= numInstances() - num; //@ requires 0 <= num; protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) { for (int i = 0; i < num; i++) { dest.add(instance(from + i)); } } /** * Replaces the attribute information by a clone of * itself. */ protected void freshAttributeInfo() { m_Attributes = (FastVector) m_Attributes.copyElements(); } /** * Returns string including all instances, their weights and * their indices in the original dataset. * * @return description of instance and its weight as a string */ protected /*@pure@*/ String instancesAndWeights(){ StringBuffer text = new StringBuffer(); for (int i = 0; i < numInstances(); i++) { text.append(instance(i) + " " + instance(i).weight()); if (i < numInstances() - 1) { text.append("\n"); } } return text.toString(); } /** * Partitions the instances around a pivot. Used by quicksort and * kthSmallestValue. * * @param attIndex the attribute's index (index starts with 0) * @param l the first index of the subset (index starts with 0) * @param r the last index of the subset (index starts with 0) * * @return the index of the middle element */ //@ requires 0 <= attIndex && attIndex < numAttributes(); //@ requires 0 <= left && left <= right && right < numInstances(); protected int partition(int attIndex, int l, int r) { double pivot = instance((l + r) / 2).value(attIndex); while (l < r) { while ((instance(l).value(attIndex) < pivot) && (l < r)) { l++; } while ((instance(r).value(attIndex) > pivot) && (l < r)) { r--; } if (l < r) { swap(l, r); l++; r--; } } if ((l == r) && (instance(r).value(attIndex) > pivot)) { r--; } return r; } /** * Implements quicksort according to Manber's "Introduction to * Algorithms". 
*
   * @param attIndex the attribute's index (index starts with 0)
   * @param left the first index of the subset to be sorted (index starts with 0)
   * @param right the last index of the subset to be sorted (index starts with 0)
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= left && left <= right && right < numInstances();
  protected void quickSort(int attIndex, int left, int right) {

    // Ranges with fewer than two elements need no sorting.
    if (left >= right) {
      return;
    }

    // Split the range, then sort the two parts independently.
    int split = partition(attIndex, left, right);
    quickSort(attIndex, left, split);
    quickSort(attIndex, split + 1, right);
  }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -