📄 instances.java
字号:
/**
*
* AgentAcademy - an open source Data Mining framework for
* training intelligent agents
*
* Copyright (C) 2001-2003 AA Consortium.
*
* This library is open source software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General
* Public License as published by the Free Software Foundation;
* either version 2.0 of the License, or (at your option) any later
* version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*/
package org.agentacademy.modules.dataminer.core;
/**
* <p>Title: The Data Miner prototype</p>
* <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
* <p>Copyright: Copyright (c) 2002</p>
* <p>Company: CERTH</p>
* @author asymeon
* @version 0.3
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.jdom.Document;
import org.jdom.Element;
import org.apache.log4j.Logger;
/**
* Class for handling an ordered set of weighted instances. <p>
*
* Typical usage (code from the main() method of this class): <p>
*
* <code>
* ... <br>
*
* // Read all the instances in the file <br>
* reader = new FileReader(filename); <br>
* instances = new Instances(reader); <br><br>
*
* // Make the last attribute be the class <br>
* instances.setClassIndex(instances.numAttributes() - 1); <br><br>
*
* // Print header and instances. <br>
* System.out.println("\nDataset:\n"); <br>
* System.out.println(instances); <br><br>
*
* ... <br>
* </code><p>
*
* All methods that change a set of instances are safe, i.e. a change
* of a set of instances does not affect any other sets of
* instances. All methods that change a dataset's attribute
* information clone the dataset before it is changed.
*
*/
public class Instances implements Serializable {
/** Log4j logger for this class, obtained once when the class is loaded. */
public static Logger log = Logger.getLogger(Instances.class);
/** The filename extension that should be used for ARFF files. */
public static String FILE_EXTENSION = ".arff";
/** The filename extension for XML files (added by zhoubin to allow XML files to be recognized). */
public static String FILE_EXTENSION_XML = ".xml";
/** The dataset's name. */
private String m_RelationName;
/** The attribute information; elements are cast to Attribute when they are read back. */
private FastVector m_Attributes;
/** The instances. */
private FastVector m_Instances;
/** The class attribute's index; a negative value means the class index is undefined. */
private int m_ClassIndex;
/** Buffer of values for sparse instance. */
private double[] m_ValueBuffer;
/** Buffer of indices for sparse instance. */
private int[] m_IndicesBuffer;
/**
* Reads an ARFF file from a reader, and assigns a weight of
* one to each instance. Lets the index of the class
* attribute be undefined (negative).
*
* @param reader the reader
* @exception Exception if the ARFF file is not read
* successfully
*/
public Instances(Reader reader) throws Exception {
StreamTokenizer tokenizer;
tokenizer = new StreamTokenizer(reader);
initTokenizer(tokenizer);
readHeader(tokenizer);
m_ClassIndex = -1;
m_Instances = new FastVector(1000);
while (getInstance(tokenizer, true)) {};
compactify();
}
/**
*Reads the header of an XML file from and
* reserves space for the given number of instances. Lets
* the class index be undefined (negative).
*
* @param document the document
* @param capacity the capacity
* @exception Exception if the header is not read successfully
* or the capacity is not positive or zero
*/
public Instances (Document document, int capacity) throws Exception {
if (capacity < 0) {
throw new Exception("Capacity has to be positive!");
}
readHeader(document);
m_ClassIndex=-1;
m_Instances = new FastVector(capacity);
}
/**
* Reads an XML dataset from a JDOM document. The header is parsed
* first; then every TUPLE element found under the single DATA child
* of the root element is handed to getInstance(Element), and finally
* compactify() is called. The class index is left undefined (negative).
* <p>
* This constructor never throws. Every problem is reported through
* the logger only:
* <ul>
* <li>no attributes declared, or a root element that is not named
* INSTANCES: an error is logged and processing continues;</li>
* <li>the number of DATA children is not exactly one, or DATA holds
* no TUPLE children: an error is logged and the constructor returns
* with an empty set of instances;</li>
* <li>any exception raised while reading: it is logged together with
* its stack trace and swallowed, which may leave the object only
* partially initialised.</li>
* </ul>
*
* @param document the org.jdom.Document object to read from
* by asymeon
*/
public Instances(Document document) {
    try {
        // Read and parse the attributes part of the XML document.
        readHeader(document);
        // Assign the FIELDS. Declaring locals of the same names here would
        // shadow them and leave the class index field at its default value
        // instead of the "undefined" marker -1.
        m_ClassIndex = -1;
        m_Instances = new FastVector(1000);
        // Check if any attributes have been declared.
        if (m_Attributes.size() == 0) {
            log.error("An error has occured : No header information are available");
        }
        Element docElement = document.getRootElement();
        // The relation name is taken from the root's "title" attribute.
        if (docElement.getName().compareTo("INSTANCES") == 0) {
            m_RelationName = docElement.getAttributeValue("title");
        } else {
            log.error("An error has occured :Document root relation expected");
        }
        // Check if the file contains exactly one data (instances) part.
        List dataList = docElement.getChildren("DATA");
        if (dataList.size() != 1) {
            log.error("An error has occured: Bad Data of the document");
            return;
        }
        // Check if the data part is empty.
        Element dataElement = docElement.getChild("DATA");
        List tupleList = dataElement.getChildren("TUPLE");
        if (tupleList.size() < 1) {
            log.error("An error has occured: No Data supplied in the document");
            return;
        }
        // Now read the instances, one per tuple, in document order.
        Iterator tupleIterator = tupleList.iterator();
        while (tupleIterator.hasNext()) {
            Element tupleContent = (Element) tupleIterator.next();
            getInstance(tupleContent);
        }
        compactify();
    } catch (Exception e) {
        // Hand the throwable to the logger so the stack trace ends up in the
        // configured appenders rather than on stderr.
        log.error("Error on Instances class has occured: " + e, e);
    }
}
/**
* Reads only the header of an ARFF stream and reserves space for the
* given number of instances. No instances are read. The class index is
* left undefined (negative).
*
* @param reader the reader supplying the ARFF header
* @param capacity the initial capacity of the instance container;
* zero is accepted
* @exception Exception if the header is not read successfully
* or the capacity is negative
*/
public Instances(Reader reader, int capacity)
throws Exception {
    // Reject a negative capacity before touching the reader.
    if (!(capacity >= 0)) {
        throw new Exception("Capacity has to be positive!");
    }
    StreamTokenizer headerTokens = new StreamTokenizer(reader);
    initTokenizer(headerTokens);
    readHeader(headerTokens);
    m_Instances = new FastVector(capacity);
    m_ClassIndex = -1;
}
/**
* Constructor copying all instances and references to
* the header information from the given set of instances.
*
* @param instances the set to be copied
*/
public Instances(Instances dataset) {
this(dataset, dataset.numInstances());
dataset.copyInstances(0, this, dataset.numInstances());
}
/**
* Creates an empty set of instances that shares the header
* information (class index, relation name and attribute vector)
* of the given dataset. A negative capacity is treated as 0.
*
* @param dataset the instances from which the header
* information is to be taken
* @param capacity the capacity of the new dataset
*/
public Instances(Instances dataset, int capacity) {
    // The header is copied by reference only: strings cannot be modified,
    // so a "shallow" copy is sufficient for them.
    m_RelationName = dataset.m_RelationName;
    m_Attributes = dataset.m_Attributes;
    m_ClassIndex = dataset.m_ClassIndex;
    // Clamp a negative capacity to zero.
    m_Instances = new FastVector(Math.max(capacity, 0));
}
/**
* Creates a new set of instances by copying a
* subset of another set.
*
* @param source the set of instances from which a subset
* is to be created
* @param first the index of the first instance to be copied
* @param toCopy the number of instances to be copied
* @exception Exception if first and toCopy are out of range
*/
public Instances(Instances source, int first, int toCopy)
throws Exception {
this(source, toCopy);
if ((first < 0) || ((first + toCopy) > source.numInstances())) {
throw new Exception("Parameters first and/or toCopy out "+
"of range");
}
source.copyInstances(first, this, toCopy);
}
/**
* Creates an empty set of instances. Uses the given
* attribute information. Sets the capacity of the set of
* instances to 0 if its negative. Given attribute information
* must not be changed after this constructor has been used.
*
* @param name the name of the relation
* @param attInfo the attribute information
* @param capacity the capacity of the set
*/
public Instances(String name, FastVector attInfo, int capacity) {
m_RelationName = name;
m_ClassIndex = -1;
m_Attributes = attInfo;
for (int i = 0; i < numAttributes(); i++) {
attribute(i).setIndex(i);
}
m_Instances = new FastVector(capacity);
}
/**
* Creates a copy of the structure with "cleansed" string attributes:
* every attribute of type STRING is replaced by a new Attribute with
* the same name and a null value vector, so the result does not carry
* references to strings seen in the past. The result contains no
* instances and keeps this dataset's class index.
*
* @return a copy of the instance structure.
*/
public Instances stringFreeStructure() {
    FastVector cleansed = (FastVector) m_Attributes.copy();
    int position = 0;
    while (position < cleansed.size()) {
        Attribute current = (Attribute) cleansed.elementAt(position);
        if (current.type() == Attribute.STRING) {
            // Swap in a fresh string attribute that has no stored values.
            Attribute blank = new Attribute(current.name(), (FastVector) null);
            cleansed.setElementAt(blank, position);
        }
        position++;
    }
    Instances structure = new Instances(relationName(), cleansed, 0);
    structure.m_ClassIndex = m_ClassIndex;
    return structure;
}
/**
* Adds one instance to the end of the set. The instance is copied
* via instance.copy() (a shallow copy according to the original
* documentation) and the copy, not the argument, is attached to this
* dataset and stored. Does not check if the instance is compatible
* with the dataset.
*
* @param instance the instance to be added
* @exception IllegalStateException if the instance container of this
* dataset has not been initialised
*/
public final void add(Instance instance) {
    // Fail with a descriptive error instead of logging and then running
    // into a bare NullPointerException on addElement().
    if (m_Instances == null) {
        log.error("m_Instances has null value!!!!!!!!");
        throw new IllegalStateException(
            "Cannot add an instance: the instance container was never initialised");
    }
    Instance newInstance = (Instance) instance.copy();
    newInstance.setDataset(this);
    m_Instances.addElement(newInstance);
}
/**
* Looks up an attribute by position.
*
* @param index the attribute's index
* @return the attribute at the given position
*/
public final Attribute attribute(int index) {
    final Object element = m_Attributes.elementAt(index);
    return (Attribute) element;
}
/**
* Looks up an attribute by name. Attributes are scanned in index
* order, so if several share the same name the first one is
* returned.
*
* @param name the attribute's name
* @return the attribute with the given name, null if the
* attribute can't be found
*/
public final Attribute attribute(String name) {
    int position = 0;
    while (position < numAttributes()) {
        Attribute candidate = attribute(position);
        if (candidate.name().equals(name)) {
            return candidate;
        }
        position++;
    }
    // No attribute carries the requested name.
    return null;
}
/**
* Tells whether the dataset declares at least one string attribute.
* The scan stops at the first attribute for which isString() is true.
*
* @return true if string attributes are present, false otherwise
*/
public boolean checkForStringAttributes() {
    for (int position = 0; position < m_Attributes.size(); position++) {
        if (attribute(position).isString()) {
            return true;
        }
    }
    return false;
}
/**
* Checks if the given instance is compatible
* with this dataset. Only looks at the size of
* the instance and the ranges of the values for
* nominal and string attributes.
*
* @return true if the instance is compatible with the dataset
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -