// instances.java
// NOTE(review): the two lines here were code-viewer website residue
// ("font size" UI text), not part of the source; replaced with this header.
/**
 * Convenience overload: computes the variance of the given numeric
 * attribute by delegating to the index-based variance method.
 *
 * @param att the numeric attribute whose variance is wanted
 * @return the variance of that attribute's values
 * @exception IllegalArgumentException if the attribute is not numeric
 */
public final double variance(Attribute att) {

  return variance(att.index());
}
/**
 * Calculates summary statistics on the values that appear in this
 * set of instances for a specified attribute.
 *
 * @param index the index of the attribute to summarize.
 * @return an AttributeStats object with it's fields calculated.
 */
public AttributeStats attributeStats(int index) {

  AttributeStats result = new AttributeStats();
  // Allocate per-type storage up front: nominal attributes get a
  // label-frequency table, numeric attributes get running stats.
  if (attribute(index).isNominal()) {
    result.nominalCounts = new int [attribute(index).numValues()];
  }
  if (attribute(index).isNumeric()) {
    result.numericStats = new weka.experiment.Stats();
  }
  result.totalCount = numInstances();

  // Visit the values in sorted order so equal values are adjacent and
  // distinct values can be counted with a single running group.
  double [] attVals = attributeToDoubleArray(index);
  int [] sorted = Utils.sort(attVals);
  int currentCount = 0;
  // Sentinel: missingValue() is NaN, which never compares equal to any
  // value, so the first element always starts a fresh group.
  double prev = Instance.missingValue();
  for (int j = 0; j < numInstances(); j++) {
    Instance current = instance(sorted[j]);
    if (current.isMissing(index)) {
      // NOTE(review): assumes Utils.sort places missing values at the
      // end of the sort order, so everything from j onward is missing
      // -- TODO confirm against Utils.sort.
      result.missingCount = numInstances() - j;
      break;
    }
    if (current.value(index) == prev) {
      currentCount++;
    } else {
      // Value changed: flush the finished group, start a new one.
      result.addDistinct(prev, currentCount);
      currentCount = 1;
      prev = current.value(index);
    }
  }
  // Flush the final group (the loop only flushes on a value change).
  // Note the very first flush above was addDistinct(NaN, 0) for the
  // sentinel; the decrement below compensates for that extra entry.
  result.addDistinct(prev, currentCount);
  result.distinctCount--; // So we don't count "missing" as a value
  return result;
}
/**
 * Collects the value of every instance in this dataset for one
 * attribute into a flat array. Handy together with Utils.sort for
 * iterating the dataset in sorted order of some attribute.
 *
 * @param index the index of the attribute.
 * @return an array holding, for each instance, its value of the
 * desired attribute.
 */
public double [] attributeToDoubleArray(int index) {

  int numValues = numInstances();
  double [] values = new double[numValues];
  for (int i = 0; i < numValues; i++) {
    values[i] = instance(i).value(index);
  }
  return values;
}
/**
 * Generates a string summarizing the set of instances. Gives a breakdown
 * for each attribute indicating the number of missing/discrete/unique
 * values and other information.
 *
 * @return a string summarizing the dataset
 */
public String toSummaryString() {

  StringBuffer result = new StringBuffer();
  result.append("Relation Name: ").append(relationName()).append('\n');
  result.append("Num Instances: ").append(numInstances()).append('\n');
  result.append("Num Attributes: ").append(numAttributes()).append('\n');
  result.append('\n');

  // Column headers for the per-attribute table below.
  result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
  result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
  result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
  result.append(Utils.padLeft("Missing", 12));
  result.append(Utils.padLeft("Unique", 12));
  result.append(Utils.padLeft("Dist", 6)).append('\n');

  for (int i = 0; i < numAttributes(); i++) {
    Attribute a = attribute(i);
    AttributeStats as = attributeStats(i);
    result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
    result.append(Utils.padRight(a.name(), 25)).append(' ');

    // Type label plus the three percentage columns. Nominal and string
    // attributes put the integer-valued percentage in the first column;
    // all other types put a literal 0% first.
    switch (a.type()) {
    case Attribute.NOMINAL:
      appendTypePercentages(result, "Nom", as, true);
      break;
    case Attribute.NUMERIC:
      appendTypePercentages(result, "Num", as, false);
      break;
    case Attribute.DATE:
      appendTypePercentages(result, "Dat", as, false);
      break;
    case Attribute.STRING:
      appendTypePercentages(result, "Str", as, true);
      break;
    default:
      appendTypePercentages(result, "???", as, false);
      break;
    }

    long percent;
    result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
    percent = Math.round(100.0 * as.missingCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
    percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
    result.append('\n');
  }
  return result.toString();
}

/**
 * Appends the type label and the three percentage columns
 * (Nom / Int / Real) for one attribute to the summary, preserving the
 * exact column layout of the summary table.
 *
 * @param result the buffer being built by toSummaryString
 * @param label the short type label ("Nom", "Num", "Dat", "Str", "???")
 * @param as the precomputed statistics for the attribute
 * @param intFirst true to emit intCount%, 0%, realCount% (nominal and
 * string layout); false to emit 0%, intCount%, realCount%
 */
private void appendTypePercentages(StringBuffer result, String label,
                                   AttributeStats as, boolean intFirst) {

  result.append(Utils.padLeft(label, 4)).append(' ');
  long percent;
  if (intFirst) {
    percent = Math.round(100.0 * as.intCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
    result.append(Utils.padLeft("" + 0, 3)).append("% ");
  } else {
    result.append(Utils.padLeft("" + 0, 3)).append("% ");
    percent = Math.round(100.0 * as.intCount / as.totalCount);
    result.append(Utils.padLeft("" + percent, 3)).append("% ");
  }
  percent = Math.round(100.0 * as.realCount / as.totalCount);
  result.append(Utils.padLeft("" + percent, 3)).append("% ");
}
/**
 * Reads a single instance using the tokenizer and appends it to the
 * dataset, dispatching on the first token to either the sparse
 * ("{index value, ...}") or the full (dense) instance reader.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstance(StreamTokenizer tokenizer,
                              boolean flag)
     throws IOException {

  // A data line is meaningless without attribute declarations.
  if (m_Attributes.size() == 0) {
    errms(tokenizer, "no header information available");
  }

  // Nothing left to read?
  getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return false;
  }

  // A leading '{' introduces the sparse ARFF instance format.
  boolean sparse = (tokenizer.ttype == '{');
  return sparse
    ? getInstanceSparse(tokenizer, flag)
    : getInstanceFull(tokenizer, flag);
}
/**
 * Reads a single instance in sparse format ("{index value, ...}") using
 * the tokenizer and appends it to the dataset; the opening '{' has
 * already been consumed by the caller. Automatically expands the dataset
 * if it is not large enough to hold the instance.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstanceSparse(StreamTokenizer tokenizer,
                                    boolean flag)
     throws IOException {

  int valIndex, numValues = 0, maxIndex = -1;

  // Accumulate index/value pairs into the shared scratch buffers
  // (m_IndicesBuffer / m_ValueBuffer) until the closing '}' is seen.
  do {

    // Get index
    getIndex(tokenizer);
    if (tokenizer.ttype == '}') {
      break;
    }

    // Is index valid?
    try{
      m_IndicesBuffer[numValues] = Integer.valueOf(tokenizer.sval).intValue();
    } catch (NumberFormatException e) {
      errms(tokenizer,"index number expected");
    }
    // Indices must be strictly increasing; this also rejects duplicates.
    if (m_IndicesBuffer[numValues] <= maxIndex) {
      errms(tokenizer,"indices have to be ordered");
    }
    if ((m_IndicesBuffer[numValues] < 0) ||
        (m_IndicesBuffer[numValues] >= numAttributes())) {
      errms(tokenizer,"index out of bounds");
    }
    maxIndex = m_IndicesBuffer[numValues];

    // Get value;
    getNextToken(tokenizer);

    // Check if value is missing.
    if (tokenizer.ttype == '?') {
      m_ValueBuffer[numValues] = Instance.missingValue();
    } else {

      // Check if token is valid.
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        errms(tokenizer,"not a valid value");
      }
      // Convert the token to the internal double encoding according to
      // the declared type of the attribute at this index.
      switch (attribute(m_IndicesBuffer[numValues]).type()) {
      case Attribute.NOMINAL:
        // Check if value appears in header.
        valIndex =
          attribute(m_IndicesBuffer[numValues]).indexOfValue(tokenizer.sval);
        if (valIndex == -1) {
          errms(tokenizer,"nominal value not declared in header");
        }
        // Nominal values are stored as the index of their label.
        m_ValueBuffer[numValues] = (double)valIndex;
        break;
      case Attribute.NUMERIC:
        // Check if value is really a number.
        try{
          m_ValueBuffer[numValues] = Double.valueOf(tokenizer.sval).
            doubleValue();
        } catch (NumberFormatException e) {
          errms(tokenizer,"number expected");
        }
        break;
      case Attribute.STRING:
        // String values are registered with the attribute; the returned
        // number identifies the stored string.
        m_ValueBuffer[numValues] =
          attribute(m_IndicesBuffer[numValues]).addStringValue(tokenizer.sval);
        break;
      case Attribute.DATE:
        try {
          m_ValueBuffer[numValues] =
            attribute(m_IndicesBuffer[numValues]).parseDate(tokenizer.sval);
        } catch (ParseException e) {
          errms(tokenizer,"unparseable date: " + tokenizer.sval);
        }
        break;
      default:
        errms(tokenizer,"unknown attribute type in column " + m_IndicesBuffer[numValues]);
      }
    }
    numValues++;
  } while (true);
  if (flag) {
    // Consume the end-of-line marker following the instance.
    getLastToken(tokenizer,true);
  }

  // Add instance to dataset: copy the scratch buffers into right-sized
  // arrays first, since the buffers are reused across calls.
  // NOTE(review): the leading 1 is presumably the instance weight --
  // confirm against the SparseInstance constructor.
  double[] tempValues = new double[numValues];
  int[] tempIndices = new int[numValues];
  System.arraycopy(m_ValueBuffer, 0, tempValues, 0, numValues);
  System.arraycopy(m_IndicesBuffer, 0, tempIndices, 0, numValues);
  add(new SparseInstance(1, tempValues, tempIndices, numAttributes()));
  return true;
}
/**
 * Reads a single instance in full (dense) format -- one value per
 * declared attribute -- using the tokenizer and appends it to the
 * dataset. Automatically expands the dataset if it is not large enough
 * to hold the instance.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstanceFull(StreamTokenizer tokenizer,
                                  boolean flag)
     throws IOException {

  double[] instance = new double[numAttributes()];
  int index;

  // Get values for all attributes.
  for (int i = 0; i < numAttributes(); i++){

    // Get next token; the first value's token was already read by the
    // caller when it dispatched on the line format.
    if (i > 0) {
      getNextToken(tokenizer);
    }

    // Check if value is missing.
    if (tokenizer.ttype == '?') {
      instance[i] = Instance.missingValue();
    } else {

      // Check if token is valid.
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        errms(tokenizer,"not a valid value");
      }
      // Convert the token to the internal double encoding according to
      // the declared type of attribute i.
      switch (attribute(i).type()) {
      case Attribute.NOMINAL:
        // Check if value appears in header.
        index = attribute(i).indexOfValue(tokenizer.sval);
        if (index == -1) {
          errms(tokenizer,"nominal value not declared in header");
        }
        // Nominal values are stored as the index of their label.
        instance[i] = (double)index;
        break;
      case Attribute.NUMERIC:
        // Check if value is really a number.
        try{
          instance[i] = Double.valueOf(tokenizer.sval).
            doubleValue();
        } catch (NumberFormatException e) {
          errms(tokenizer,"number expected");
        }
        break;
      case Attribute.STRING:
        // String values are registered with the attribute; the returned
        // number identifies the stored string.
        instance[i] = attribute(i).addStringValue(tokenizer.sval);
        break;
      case Attribute.DATE:
        try {
          instance[i] = attribute(i).parseDate(tokenizer.sval);
        } catch (ParseException e) {
          errms(tokenizer,"unparseable date: " + tokenizer.sval);
        }
        break;
      default:
        errms(tokenizer,"unknown attribute type in column " + i);
      }
    }
  }
  if (flag) {
    // Consume the end-of-line marker following the instance.
    getLastToken(tokenizer,true);
  }

  // Add instance to dataset.
  // NOTE(review): the leading 1 is presumably the instance weight --
  // confirm against the Instance constructor.
  add(new Instance(1, instance));
  return true;
}
/**
* Reads and stores header of an ARFF file.
*
* @param tokenizer the stream tokenizer
* @exception IOException if the information is not read
* successfully
*/
protected void readHeader(StreamTokenizer tokenizer)
throws IOException {
String attributeName;
FastVector attributeValues;
// Get name of relation.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
errms(tokenizer,"premature end of file");
}
if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
getNextToken(tokenizer);
m_RelationName = tokenizer.sval;
getLastToken(tokenizer,false);
} else {
errms(tokenizer,"keyword " + ARFF_RELATION + " expected");
}
// Create vectors to hold information temporarily.
m_Attributes = new FastVector();
// Get attribute declarations.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
errms(tokenizer,"premature end of file");
}
while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
// Get attribute name.
getNextToken(tokenizer);
attributeName = tokenizer.sval;
getNextToken(tokenizer);
// Check if attribute is nominal.
if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, or string.
if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
m_Attributes.
addElement(new Attribute(attributeName, (FastVector)null,
numAttributes()));
readTillEOL(tokenizer);
// NOTE(review): the remainder of readHeader (and of the file) was lost
// in extraction; the lines here were code-viewer keyboard-shortcut UI
// text, not source code. Recover the rest of this method from the
// original Instances.java before compiling.