📄 instances.java
字号:
}
if (numFolds > numInstances()) {
throw new Exception("Can't have more folds than instances!");
}
numInstForFold = numInstances() / numFolds;
if (numFold < numInstances() % numFolds) {
numInstForFold++;
offset = numFold;
}else
offset = numInstances() % numFolds;
train = new Instances(this, numInstances() - numInstForFold);
first = numFold * (numInstances() / numFolds) + offset;
copyInstances(0, train, first);
copyInstances(first + numInstForFold, train,
numInstances() - first - numInstForFold);
return train;
}
/**
 * Computes the weighted sample variance of a numeric attribute.
 * Instances whose value for the attribute is missing are skipped; every
 * other instance contributes its value scaled by its weight.
 *
 * @param attIndex the index of the numeric attribute
 * @return the variance; 0 if Utils.smOrEq reports the summed weight of
 * the non-missing instances as not exceeding 1, and never a negative number
 * @exception Exception if the attribute is not numeric
 */
public final double variance(int attIndex) throws Exception {
  if (!attribute(attIndex).isNumeric()) {
    throw new Exception("Can't compute variance because attribute is " +
                        "not numeric!");
  }

  double sum = 0, sumSquared = 0, sumOfWeights = 0;
  for (int i = 0; i < numInstances(); i++) {
    Instance current = instance(i);
    if (!current.isMissing(attIndex)) {
      double weight = current.weight();
      double value = current.value(attIndex);
      sum += weight * value;
      sumSquared += weight * value * value;
      sumOfWeights += weight;
    }
  }

  // With a total weight of at most 1 the divisor below would be zero or
  // negative, so the variance is defined as 0 in that case.
  if (Utils.smOrEq(sumOfWeights, 1)) {
    return 0;
  }

  double result = (sumSquared - (sum * sum / sumOfWeights)) /
    (sumOfWeights - 1);

  // The subtraction above cancels catastrophically when all values are
  // (nearly) identical and can yield a tiny negative number. A variance
  // is never negative, so clamp instead of handing callers a value whose
  // square root is NaN.
  if (result < 0) {
    return 0;
  }
  return result;
}
/**
 * Computes the variance for a numeric attribute given as an
 * Attribute object. The work is delegated to {@link #variance(int)}
 * using the attribute's index.
 *
 * @param att the numeric attribute
 * @return the variance if the attribute is numeric
 * @exception Exception if the attribute is not numeric
 */
public final double variance(Attribute att) throws Exception {
  final int attIndex = att.index();
  return variance(attIndex);
}
/**
 * Collects summary statistics about the values that one attribute takes
 * over this set of instances.
 *
 * The instances are visited in the order returned by Utils.sort on the
 * attribute's values, so equal values are adjacent and each run of equal
 * values is reported once through AttributeStats.addDistinct.
 *
 * @param index the index of the attribute to summarize.
 * @return an AttributeStats object with its fields calculated.
 */
public AttributeStats attributeStats(int index) {
  AttributeStats stats = new AttributeStats();
  if (attribute(index).isNominal()) {
    stats.nominalCounts = new int [attribute(index).numValues()];
  }
  if (attribute(index).isNumeric()) {
    stats.numericStats = new org.agentacademy.modules.dataminer.experiment.Stats ();
  }
  stats.totalCount = numInstances();

  int [] order = Utils.sort(attributeToDoubleArray(index));

  // The run tracker starts on the missing-value placeholder with a count
  // of 0; that placeholder run is handed to addDistinct like any other.
  double runValue = Instance.missingValue();
  int runLength = 0;

  int pos = 0;
  while (pos < numInstances()) {
    Instance inst = instance(order[pos]);
    if (inst.isMissing(index)) {
      // Everything from this position on is counted as missing.
      // NOTE(review): relies on Utils.sort placing missing values last
      // -- confirm against Utils.sort.
      stats.missingCount = numInstances() - pos;
      break;
    }
    double value = inst.value(index);
    if (!Utils.eq(value, runValue)) {
      // A new value begins: flush the run that just ended.
      stats.addDistinct(runValue, runLength);
      runLength = 1;
      runValue = value;
    } else {
      runLength++;
    }
    pos++;
  }

  // Flush the last run.
  stats.addDistinct(runValue, runLength);
  stats.distinctCount--; // So we don't count "missing" as a value
  return stats;
}
/**
 * Extracts one attribute's value from every instance in this dataset,
 * in dataset order. Handy together with Utils.sort for walking the
 * dataset in sorted order of that attribute.
 *
 * @param index the index of the attribute.
 * @return a new array whose i-th entry is the value of the attribute
 * for the i-th instance.
 */
public double [] attributeToDoubleArray(int index) {
  double [] values = new double[numInstances()];
  int row = 0;
  while (row < values.length) {
    values[row] = instance(row).value(index);
    row++;
  }
  return values;
}
/**
 * Builds a textual summary of this set of instances: a short header
 * (relation name, number of instances, number of attributes) followed
 * by a table with one row per attribute. Each row shows the attribute's
 * position, name and type, the percentage of integer and real values,
 * and the missing, unique and distinct counts taken from
 * attributeStats.
 *
 * @return a string summarizing the dataset
 */
public String toSummaryString() {
  StringBuffer out = new StringBuffer();

  // Header lines.
  out.append("Relation Name: ");
  out.append(relationName());
  out.append('\n');
  out.append("Num Instances: ");
  out.append(numInstances());
  out.append('\n');
  out.append("Num Attributes: ");
  out.append(numAttributes());
  out.append('\n');
  out.append('\n');

  // Column titles.
  out.append(Utils.padLeft("", 5));
  out.append(Utils.padRight("Name", 25));
  out.append(Utils.padLeft("Type", 5));
  out.append(Utils.padLeft("Nom", 5));
  out.append(Utils.padLeft("Int", 5));
  out.append(Utils.padLeft("Real", 5));
  out.append(Utils.padLeft("Missing", 12));
  out.append(Utils.padLeft("Unique", 12));
  out.append(Utils.padLeft("Dist", 6));
  out.append('\n');

  for (int i = 0; i < numAttributes(); i++) {
    Attribute att = attribute(i);
    AttributeStats stats = attributeStats(i);

    out.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
    out.append(Utils.padRight(att.name(), 25)).append(' ');

    // The type decides the tag and whether the integer-value share is
    // printed under "Nom" (nominal and string attributes) or under
    // "Int" (numeric and unknown types); the other column shows 0%.
    String typeTag;
    boolean intShareUnderNom;
    switch (att.type()) {
    case Attribute.NOMINAL:
      typeTag = "Nom";
      intShareUnderNom = true;
      break;
    case Attribute.NUMERIC:
      typeTag = "Num";
      intShareUnderNom = false;
      break;
    case Attribute.STRING:
      typeTag = "Str";
      intShareUnderNom = true;
      break;
    default:
      typeTag = "???";
      intShareUnderNom = false;
      break;
    }

    long intPercent = Math.round(100.0 * stats.intCount / stats.totalCount);
    long realPercent = Math.round(100.0 * stats.realCount / stats.totalCount);
    String zeroCell = Utils.padLeft("" + 0, 3);
    String intCell = Utils.padLeft("" + intPercent, 3);
    String realCell = Utils.padLeft("" + realPercent, 3);

    out.append(Utils.padLeft(typeTag, 4)).append(' ');
    out.append(intShareUnderNom ? intCell : zeroCell).append("% ");
    out.append(intShareUnderNom ? zeroCell : intCell).append("% ");
    out.append(realCell).append("% ");

    long missingPercent =
      Math.round(100.0 * stats.missingCount / stats.totalCount);
    out.append(Utils.padLeft("" + stats.missingCount, 5)).append(" /");
    out.append(Utils.padLeft("" + missingPercent, 3)).append("% ");

    long uniquePercent =
      Math.round(100.0 * stats.uniqueCount / stats.totalCount);
    out.append(Utils.padLeft("" + stats.uniqueCount, 5)).append(" /");
    out.append(Utils.padLeft("" + uniquePercent, 3)).append("% ");

    out.append(Utils.padLeft("" + stats.distinctCount, 5)).append(' ');
    out.append('\n');
  }
  return out.toString();
}
/**
 * Reads the next instance from the tokenizer and appends it to the
 * dataset. The first token decides the format: an opening brace hands
 * the work to getInstanceSparse, anything else to getInstanceFull.
 * If no attributes have been declared, the problem is reported through
 * errms before any token is read.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return false if end of file has been reached
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstance(StreamTokenizer tokenizer,
                              boolean flag)
     throws IOException {
  // Data cannot be parsed before the header has declared attributes.
  final boolean headerMissing = (m_Attributes.size() == 0);
  if (headerMissing) {
    errms(tokenizer,"no header information available");
  }

  // Nothing left to read?
  getFirstToken(tokenizer);
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    return false;
  }

  // Dispatch on the instance format.
  return (tokenizer.ttype == '{')
    ? getInstanceSparse(tokenizer, flag)
    : getInstanceFull(tokenizer, flag);
}
/**
 * Reads a single instance in sparse format using the tokenizer and
 * appends it to the dataset as a SparseInstance with weight 1.
 * Index/value pairs are read until a closing brace is found. Every
 * index must parse as an integer, be larger than the previous index
 * and lie within the range of declared attributes; every value must
 * be '?' (missing) or a word token that fits the attribute's type.
 * Violations are reported through errms.
 *
 * @param tokenizer the tokenizer to be used
 * @param flag if method should test for carriage return after
 * each instance
 * @return true once the instance has been added
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstanceSparse(StreamTokenizer tokenizer,
                                    boolean flag)
     throws IOException {
  int count = 0;       // number of index/value pairs stored so far
  int lastIndex = -1;  // largest attribute index accepted so far

  while (true) {
    // The next token is either an attribute index or the closing brace.
    getIndex(tokenizer);
    if (tokenizer.ttype == '}') {
      break;
    }

    // Parse and validate the index.
    try {
      m_IndicesBuffer[count] = Integer.parseInt(tokenizer.sval);
    } catch (NumberFormatException e) {
      errms(tokenizer,"index number expected");
    }
    int attIndex = m_IndicesBuffer[count];
    if (attIndex <= lastIndex) {
      errms(tokenizer,"indices have to be ordered");
    }
    if ((attIndex < 0) || (attIndex >= numAttributes())) {
      errms(tokenizer,"index out of bounds");
    }
    lastIndex = attIndex;

    // Read the value that belongs to the index.
    getNextToken(tokenizer);
    if (tokenizer.ttype == '?') {
      m_ValueBuffer[count] = Instance.missingValue();
    } else {
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        errms(tokenizer,"not a valid value");
      }
      if (attribute(attIndex).isNominal()) {
        // Nominal values are stored as their position in the header.
        int valIndex = attribute(attIndex).indexOfValue(tokenizer.sval);
        if (valIndex == -1) {
          errms(tokenizer,"nominal value not declared in header");
        }
        m_ValueBuffer[count] = (double)valIndex;
      } else if (attribute(attIndex).isNumeric()) {
        try {
          m_ValueBuffer[count] =
            Double.valueOf(tokenizer.sval).doubleValue();
        } catch (NumberFormatException e) {
          errms(tokenizer,"number expected");
        }
      } else {
        // Neither nominal nor numeric: register the text as a string
        // value and store what addStringValue returns.
        m_ValueBuffer[count] =
          attribute(attIndex).addStringValue(tokenizer.sval);
      }
    }
    count++;
  }

  if (flag) {
    getLastToken(tokenizer,true);
  }

  // Copy the filled part of the shared buffers and add the instance.
  double[] values = new double[count];
  int[] indices = new int[count];
  System.arraycopy(m_ValueBuffer, 0, values, 0, count);
  System.arraycopy(m_IndicesBuffer, 0, indices, 0, count);
  add(new SparseInstance(1, values, indices, numAttributes()));
  return true;
}
/**
 * Builds a single instance from an XML tuple element and appends it
 * to the dataset with weight 1. Every ATTRVALUE child of the tuple is
 * matched to a declared attribute through its "name" XML attribute;
 * the child's text is the value.
 *
 * Problems are logged and parsing continues (nothing is thrown for
 * them here):
 * - a child naming an undeclared attribute is skipped;
 * - a nominal value that is not declared in the header, or a
 *   non-numeric text for a numeric attribute, is stored as a missing
 *   value so that no invalid nominal index (-1) or silent 0 ends up
 *   in the data.
 *
 * Attributes for which the tuple has no ATTRVALUE child keep the
 * array's initial value of 0.
 *
 * Originally contributed by asymeon.
 *
 * @param tuple the node to be searched
 * @return true if everything is ok
 * @exception IOException if the information is not read
 * successfully
 */
protected boolean getInstance(Element tuple) throws IOException {
  double[] instance = new double[numAttributes()];

  // Get the ATTRVALUEs in one TUPLE
  List tupleContentList = tuple.getChildren("ATTRVALUE");
  Iterator attrValueIterator = tupleContentList.iterator();
  while (attrValueIterator.hasNext()) {
    Element attrValueElement = (Element) attrValueIterator.next();
    String attrValueName = attrValueElement.getAttributeValue("name");
    org.agentacademy.modules.dataminer.core.Attribute coreAttr = attribute(attrValueName);
    if (coreAttr == null) {
      log.error("An error has occured: The " + attrValueName + " attribute has not been declared in header");
      continue;
    }

    int i = coreAttr.index();
    String attrValueText = attrValueElement.getText();
    if (attrValueText.equalsIgnoreCase("?")) {
      instance[i] = Instance.missingValue();
    } else if (coreAttr.isNominal()) {
      int index = attribute(i).indexOfValue(attrValueText);
      if (index == -1) {
        log.error("Error occured: The " + attrValueText + " nominal value is not declared in header");
        // -1 is not a valid nominal index; record the value as missing.
        instance[i] = Instance.missingValue();
      } else {
        instance[i] = (double)index;
      }
    } else if (coreAttr.isNumeric()) {
      // Check if value is really a number.
      try {
        instance[i] = Double.valueOf(attrValueText).doubleValue();
      } catch (NumberFormatException e) {
        // Report the attribute's name (the message used to print the
        // offending text in its place) and record the value as missing
        // instead of leaving the array default of 0 behind.
        log.error("Error occured: For attribute " + attrValueName + " number expected");
        instance[i] = Instance.missingValue();
      }
    } else {
      instance[i] = coreAttr.addStringValue(attrValueText);
    }
  }

  add(new Instance (1 , instance));
  return true;
}
/**
* Reads a single instance using the tokenizer and appends it
* to the dataset. Automatically expands the dataset if it
* is not large enough to hold the instance.
*
* @param tokenizer the tokenizer to be used
* @param flag if method should test for carriage return after
* each instance
* @return false if end of file has been reached
* @exception IOException if the information is not read
* successfully
*/
protected boolean getInstanceFull(StreamTokenizer tokenizer,
boolean flag)
throws IOException {
double[] instance = new double[numAttributes()];
int index;
// Get values for all attributes.
for (int i = 0; i < numAttributes(); i++){
// Get next token
if (i > 0) {
getNextToken(tokenizer);
}
// Check if value is missing.
if (tokenizer.ttype == '?') {
instance[i] = Instance.missingValue();
} else {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -