📄 instances.java
字号:
if (i < numInstances() - 1) { text.append('\n'); } } return text.toString(); } /** * Creates the training set for one fold of a cross-validation * on the dataset. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold) { int numInstForFold, first, offset; Instances train; if (numFolds < 2) { throw new IllegalArgumentException("Number of folds must be at least 2!"); } if (numFolds > numInstances()) { throw new IllegalArgumentException("Can't have more folds than instances!"); } numInstForFold = numInstances() / numFolds; if (numFold < numInstances() % numFolds) { numInstForFold++; offset = numFold; }else offset = numInstances() % numFolds; train = new Instances(this, numInstances() - numInstForFold); first = numFold * (numInstances() / numFolds) + offset; copyInstances(0, train, first); copyInstances(first + numInstForFold, train, numInstances() - first - numInstForFold); return train; } /** * Creates the training set for one fold of a cross-validation * on the dataset. The data is subsequently randomized based * on the given random number generator. * * @param numFolds the number of folds in the cross-validation. Must * be greater than 1. * @param numFold 0 for the first fold, 1 for the second, ... * @param random the random number generator * @return the training set * @throws IllegalArgumentException if the number of folds is less than 2 * or greater than the number of instances. */ //@ requires 2 <= numFolds && numFolds < numInstances(); //@ requires 0 <= numFold && numFold < numFolds; public Instances trainCV(int numFolds, int numFold, Random random) { Instances train = trainCV(numFolds, numFold); train.randomize(random); return train; } /** * Computes the variance for a numeric attribute. * * @param attIndex the numeric attribute (index starts with 0) * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(int attIndex) { double sum = 0, sumSquared = 0, sumOfWeights = 0; if (!attribute(attIndex).isNumeric()) { throw new IllegalArgumentException("Can't compute variance because attribute is " + "not numeric!"); } for (int i = 0; i < numInstances(); i++) { if (!instance(i).isMissing(attIndex)) { sum += instance(i).weight() * instance(i).value(attIndex); sumSquared += instance(i).weight() * instance(i).value(attIndex) * instance(i).value(attIndex); sumOfWeights += instance(i).weight(); } } if (sumOfWeights <= 1) { return 0; } double result = (sumSquared - (sum * sum / sumOfWeights)) / (sumOfWeights - 1); // We don't like negative variance if (result < 0) { return 0; } else { return result; } } /** * Computes the variance for a numeric attribute. * * @param att the numeric attribute * @return the variance if the attribute is numeric * @throws IllegalArgumentException if the attribute is not numeric */ public /*@pure@*/ double variance(Attribute att) { return variance(att.index()); } /** * Calculates summary statistics on the values that appear in this * set of instances for a specified attribute. * * @param index the index of the attribute to summarize (index starts with 0) * @return an AttributeStats object with it's fields calculated. */ //@ requires 0 <= index && index < numAttributes(); public AttributeStats attributeStats(int index) { AttributeStats result = new AttributeStats(); if (attribute(index).isNominal()) { result.nominalCounts = new int [attribute(index).numValues()]; } if (attribute(index).isNumeric()) { result.numericStats = new weka.experiment.Stats(); } result.totalCount = numInstances(); double [] attVals = attributeToDoubleArray(index); int [] sorted = Utils.sort(attVals); int currentCount = 0; double prev = Instance.missingValue(); for (int j = 0; j < numInstances(); j++) { Instance current = instance(sorted[j]); if (current.isMissing(index)) { result.missingCount = numInstances() - j; break; } if (current.value(index) == prev) { currentCount++; } else { result.addDistinct(prev, currentCount); currentCount = 1; prev = current.value(index); } } result.addDistinct(prev, currentCount); result.distinctCount--; // So we don't count "missing" as a value return result; } /** * Gets the value of all instances in this dataset for a particular * attribute. Useful in conjunction with Utils.sort to allow iterating * through the dataset in sorted order for some attribute. * * @param index the index of the attribute. * @return an array containing the value of the desired attribute for * each instance in the dataset. */ //@ requires 0 <= index && index < numAttributes(); public /*@pure@*/ double [] attributeToDoubleArray(int index) { double [] result = new double[numInstances()]; for (int i = 0; i < result.length; i++) { result[i] = instance(i).value(index); } return result; } /** * Generates a string summarizing the set of instances. Gives a breakdown * for each attribute indicating the number of missing/discrete/unique * values and other information. * * @return a string summarizing the dataset */ public String toSummaryString() { StringBuffer result = new StringBuffer(); result.append("Relation Name: ").append(relationName()).append('\n'); result.append("Num Instances: ").append(numInstances()).append('\n'); result.append("Num Attributes: ").append(numAttributes()).append('\n'); result.append('\n'); result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25)); result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5)); result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5)); result.append(Utils.padLeft("Missing", 12)); result.append(Utils.padLeft("Unique", 12)); result.append(Utils.padLeft("Dist", 6)).append('\n'); for (int i = 0; i < numAttributes(); i++) { Attribute a = attribute(i); AttributeStats as = attributeStats(i); result.append(Utils.padLeft("" + (i + 1), 4)).append(' '); result.append(Utils.padRight(a.name(), 25)).append(' '); long percent; switch (a.type()) { case Attribute.NOMINAL: result.append(Utils.padLeft("Nom", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.NUMERIC: result.append(Utils.padLeft("Num", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.DATE: result.append(Utils.padLeft("Dat", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.STRING: result.append(Utils.padLeft("Str", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; case Attribute.RELATIONAL: result.append(Utils.padLeft("Rel", 4)).append(' '); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; default: result.append(Utils.padLeft("???", 4)).append(' '); result.append(Utils.padLeft("" + 0, 3)).append("% "); percent = Math.round(100.0 * as.intCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); percent = Math.round(100.0 * as.realCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); break; } result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /"); percent = Math.round(100.0 * as.missingCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /"); percent = Math.round(100.0 * as.uniqueCount / as.totalCount); result.append(Utils.padLeft("" + percent, 3)).append("% "); result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' '); result.append('\n'); } return result.toString(); } /** * Reads a single instance using the tokenizer and appends it * to the dataset. Automatically expands the dataset if it * is not large enough to hold the instance. * * @param tokenizer the tokenizer to be used * @param flag if method should test for carriage return after * each instance * @return false if end of file has been reached * @throws IOException if the information is not read * successfully */ protected boolean getInstance(StreamTokenizer tokenizer, boolean flag) throws IOException { // Check if any attributes have been declared. if (m_Attributes.size() == 0) { errms(tokenizer,"no header information available"); } // Check if end of file reached. getFirstToken(tokenizer); if (tokenizer.ttype == StreamTokenizer.TT_EOF) { return false; } // Parse instance if (tokenizer.ttype == '{') { return getInstanceSparse(tokenizer, flag); } else { return getInstanceFull(tokenizer, flag); } } /** * Reads a single instance using the tokenizer and appends it * to the dataset. Automatically expands the dataset if it * is not large enough to hold the instance. * * @param tokenizer the tokenizer to be used * @param flag if method should test for carriage return after * each instance * @return false if end of file has been reached * @throws IOException if the information is not read * successfully */ protected boolean getInstanceSparse(StreamTokenizer tokenizer, boolean flag) throws IOException { int valIndex, numValues = 0, maxIndex = -1; // Get values do { // Get index getIndex(tokenizer); if (tokenizer.ttype == '}') { break; } // Is index valid? try{ m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue(); } catch (NumberFormatException e) { errms(tokenizer,"index number expected"); } if (m_IndicesBuffer[numValues] <= maxIndex) { errms(tokenizer,"indices have to be ordered"); } if ((m_IndicesBuffer[numValues] < 0) || (m_IndicesBuffer[numValues] >= numAttributes())) { errms(tokenizer,"index out of bounds"); } maxIndex = m_IndicesBuffer[numValues]; // Get value; getNextToken(tokenizer); // Check if value is missing. if (tokenizer.ttype == '?') { m_ValueBuffer[numValues] = Instance.missingValue(); } else { // Check if token is valid. if (tokenizer.ttype != StreamTokenizer.TT_WORD) { errms(tokenizer,"not a valid value"); } switch (attribute(m_IndicesBuffer[numValues]).type()) { case Attribute.NOMINAL: // Check if value appears in header. valIndex = attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval); if (valIndex == -1) { errms(tokenizer,"nominal value not declared in header"); } m_ValueBuffer[numValues] = (double)valIndex; break; case Attribute.NUMERIC: // Check if value is really a number. try{ m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval). doubleValue(); } catch (NumberFormatException e) { errms(tokenizer,"number expected"); } break; case Attribute.STRING: m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval); break; case Attribute.DATE: try { m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).parseDate(tokenizer.sval); } catch (ParseException e) { errms(tokenizer,"unparseable date: " + tokenizer.sval); } break; case Attribute.RELATIONAL: StringReader reader = new StringReader(tokenizer.sval); StreamTokenizer innerTokenizer = new StreamTokenizer(reader); initTokenizer(innerTokenizer); Instances data = new Instances(attribute(m_IndicesBuffer[numValues]).relation(), 100); // Allocate buffers in case sparse instances have to be read data.m_ValueBuffer = new double[data.numAttributes()]; data.m_IndicesBuffer = new int[data.numAttributes()]; while (data.getInstance(innerTokenizer, true)) {}; data.compactify(); m_ValueBuffer[numValues] = attribute(m_IndicesBuffer[numValues]).addRelation(data); break; default: errms(tokenizer,"unknown attribute type in column " + m_IndicesBuffer[numValues]); } } numValues++; } while (true); if (flag) { getLastToken(tokenizer,true); } // Add instance to dataset double[] tempValues = new double[numValues]; int[] tempIndices = new int[numValues]; System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues); System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues); add(new SparseInstance(1, tempValues, tempIndices, numAttributes())); return true; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -