📄 instances.java
字号:
} else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
String format = null;
if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((tokenizer.ttype != StreamTokenizer.TT_WORD) &&
(tokenizer.ttype != '\'') &&
(tokenizer.ttype != '\"')) {
errms(tokenizer,"not a valid date format");
}
format = tokenizer.sval;
readTillEOL(tokenizer);
} else {
tokenizer.pushBack();
}
m_Attributes.addElement(new Attribute(attributeName, format,
numAttributes()));
} else {
errms(tokenizer,"no valid attribute type or invalid "+
"enumeration");
}
} else {
// Attribute is nominal.
attributeValues = new FastVector();
tokenizer.pushBack();
// Get values for nominal attribute.
if (tokenizer.nextToken() != '{') {
errms(tokenizer,"{ expected at beginning of enumeration");
}
while (tokenizer.nextToken() != '}') {
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
errms(tokenizer,"} expected at end of enumeration");
} else {
attributeValues.addElement(tokenizer.sval);
}
}
if (attributeValues.size() == 0) {
errms(tokenizer,"no nominal values found");
}
m_Attributes.
addElement(new Attribute(attributeName, attributeValues,
numAttributes()));
}
getLastToken(tokenizer,false);
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF)
errms(tokenizer,"premature end of file");
}
// Check if data part follows. We can't easily check for EOL.
if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
errms(tokenizer,"keyword " + ARFF_DATA + " expected");
}
// Check if any attributes have been declared.
if (m_Attributes.size() == 0) {
errms(tokenizer,"no attributes declared");
}
// Allocate buffers in case sparse instances have to be read
m_ValueBuffer = new double[numAttributes()];
m_IndicesBuffer = new int[numAttributes()];
}
/**
 * Copies instances from this set to the end of another
 * one.
 *
 * @param from the position (in this set) of the first instance to be copied
 * @param dest the destination for the instances
 * @param num the number of instances to be copied
 */
private void copyInstances(int from, Instances dest, int num) {
for (int i = 0; i < num; i++) {
dest.add(instance(from + i));
}
}
/**
 * Throws an error message with line number and last token read.
 *
 * @param tokenizer the stream tokenizer, whose toString() supplies the
 * current token and line number for the message
 * @param theMsg the error message to be thrown
 * @throws IOException always, containing the error message
 */
private void errms(StreamTokenizer tokenizer, String theMsg)
throws IOException {
throw new IOException(theMsg + ", read " + tokenizer.toString());
}
/**
 * Replaces the attribute information by a clone of
 * itself — presumably so later modifications don't affect other
 * Instances objects sharing the same attribute vector (TODO confirm
 * against FastVector.copyElements()).
 */
private void freshAttributeInfo() {
m_Attributes = (FastVector) m_Attributes.copyElements();
}
/**
 * Reads the next non-EOL token, skipping empty lines, and normalizes
 * its type: quoted tokens are reported as ordinary words, and a bare
 * "?" word is flagged with the special '?' token type.
 *
 * @param tokenizer the stream tokenizer
 * @exception IOException if reading the next token fails
 */
private void getFirstToken(StreamTokenizer tokenizer)
  throws IOException {

  // Advance past any blank lines; the loop condition does the reading.
  do {
    // nothing — nextToken() in the condition consumes the stream
  } while (tokenizer.nextToken() == StreamTokenizer.TT_EOL);

  boolean quoted = (tokenizer.ttype == '\'') || (tokenizer.ttype == '"');
  if (quoted) {
    // Treat quoted strings like plain words.
    tokenizer.ttype = StreamTokenizer.TT_WORD;
  } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD)
             && tokenizer.sval.equals("?")) {
    // A lone "?" gets its own token type.
    tokenizer.ttype = '?';
  }
}
/**
 * Reads the next token for an index, checking for a premature end of line.
 *
 * @param tokenizer the stream tokenizer
 * @exception IOException if it finds a premature end of line or file
 */
private void getIndex(StreamTokenizer tokenizer) throws IOException {
  int tok = tokenizer.nextToken();
  if (tok == StreamTokenizer.TT_EOL) {
    errms(tokenizer, "premature end of line");
  }
  if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
    errms(tokenizer, "premature end of file");
  }
}
/**
 * Reads the next token and checks that it ends the current line.
 *
 * @param tokenizer the stream tokenizer
 * @param endOfFileOk whether reaching end of file is acceptable here
 * @exception IOException if it doesn't find an end of line (or an
 * end of file, when endOfFileOk is true)
 */
private void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
  throws IOException {
  boolean atEol = (tokenizer.nextToken() == StreamTokenizer.TT_EOL);
  boolean acceptableEof =
      endOfFileOk && (tokenizer.ttype == StreamTokenizer.TT_EOF);
  if (!atEol && !acceptableEof) {
    errms(tokenizer, "end of line expected");
  }
}
/**
 * Reads the next token, checking for a premature end of line or file,
 * and normalizes its type: quoted tokens become ordinary words and a
 * bare "?" word is flagged with the special '?' token type.
 *
 * @param tokenizer the stream tokenizer
 * @exception IOException if it finds a premature end of line or file
 */
private void getNextToken(StreamTokenizer tokenizer)
  throws IOException {
  if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
    errms(tokenizer, "premature end of line");
  }
  switch (tokenizer.ttype) {
  case StreamTokenizer.TT_EOF:
    errms(tokenizer, "premature end of file");
    break;
  case '\'':
  case '"':
    // Treat quoted strings like plain words.
    tokenizer.ttype = StreamTokenizer.TT_WORD;
    break;
  case StreamTokenizer.TT_WORD:
    if (tokenizer.sval.equals("?")) {
      // A lone "?" gets its own token type.
      tokenizer.ttype = '?';
    }
    break;
  default:
    break;
  }
}
/**
 * Initializes the StreamTokenizer used for reading the ARFF file.
 *
 * @param tokenizer the stream tokenizer
 */
private void initTokenizer(StreamTokenizer tokenizer){
tokenizer.resetSyntax();
// Control characters and space separate tokens.
tokenizer.whitespaceChars(0, ' ');
// Every printable (and high-byte) character can be part of a word.
tokenizer.wordChars(' '+1,'\u00FF');
// Commas separate values, so treat them as whitespace too.
tokenizer.whitespaceChars(',',',');
// '%' introduces a comment.
tokenizer.commentChar('%');
tokenizer.quoteChar('"');
tokenizer.quoteChar('\'');
// Braces delimit nominal value enumerations; report them as tokens.
tokenizer.ordinaryChar('{');
tokenizer.ordinaryChar('}');
// Line breaks are significant for the parser.
tokenizer.eolIsSignificant(true);
}
/**
 * Returns a string including all instances and their weights,
 * one "instance weight" pair per line.
 *
 * @return description of the instances and their weights as a string
 */
private String instancesAndWeights(){
  // StringBuilder avoids StringBuffer's unnecessary synchronization, and
  // chained appends avoid the temporary strings that '+' concatenation
  // inside append() would create.
  StringBuilder text = new StringBuilder();
  for (int i = 0; i < numInstances(); i++) {
    if (i > 0) {
      text.append("\n");
    }
    text.append(instance(i)).append(" ").append(instance(i).weight());
  }
  return text.toString();
}
/**
 * Implements quicksort, ordering the instances in the range
 * [lo0, hi0] by their values for the given attribute.
 *
 * @param attIndex the attribute's index
 * @param lo0 the first index of the subset to be sorted
 * @param hi0 the last index of the subset to be sorted
 */
private void quickSort(int attIndex, int lo0, int hi0) {
  int lo = lo0, hi = hi0;
  double mid;
  if (hi0 > lo0) {
    // Arbitrarily establish the partition element as the midpoint of
    // the range. The unsigned shift computes the midpoint without the
    // int overflow that (lo0 + hi0) / 2 suffers for large index sums.
    mid = instance((lo0 + hi0) >>> 1).value(attIndex);
    // Loop through the array until the indices cross.
    while (lo <= hi) {
      // Find the first element that is greater than or equal to
      // the partition element, starting from the left index.
      while ((instance(lo).value(attIndex) < mid) && (lo < hi0)) {
        ++lo;
      }
      // Find an element that is smaller than or equal to
      // the partition element, starting from the right index.
      while ((instance(hi).value(attIndex) > mid) && (hi > lo0)) {
        --hi;
      }
      // If the indices have not crossed, swap the two instances.
      if (lo <= hi) {
        swap(lo, hi);
        ++lo;
        --hi;
      }
    }
    // If the right index has not reached the left side of the array,
    // sort the left partition.
    if (lo0 < hi) {
      quickSort(attIndex, lo0, hi);
    }
    // If the left index has not reached the right side of the array,
    // sort the right partition.
    if (lo < hi0) {
      quickSort(attIndex, lo, hi0);
    }
  }
}
/**
 * Reads and skips all tokens before the next end of line token,
 * leaving that token in the stream for the caller.
 *
 * @param tokenizer the stream tokenizer
 * @exception IOException if reading a token fails
 */
private void readTillEOL(StreamTokenizer tokenizer)
  throws IOException {
  int tok;
  // Also stop on TT_EOF: once the stream is exhausted, nextToken()
  // returns TT_EOF forever, so testing only for TT_EOL would spin
  // in an infinite loop on input that doesn't end with a newline.
  do {
    tok = tokenizer.nextToken();
  } while ((tok != StreamTokenizer.TT_EOL)
           && (tok != StreamTokenizer.TT_EOF));
  tokenizer.pushBack();
}
/**
 * Help function needed for stratification of the set: rebuilds the
 * instance vector by repeatedly walking the dataset in strides of
 * numFolds, starting one position later on each pass, until every
 * instance has been copied.
 *
 * @param numFolds the number of folds for the stratification
 */
private void stratStep (int numFolds){
  FastVector reordered = new FastVector(m_Instances.capacity());
  int offset = 0;
  // Create the stratified batch.
  while (reordered.size() < numInstances()) {
    for (int k = offset; k < numInstances(); k += numFolds) {
      reordered.addElement(instance(k));
    }
    offset++;
  }
  m_Instances = reordered;
}
/**
 * Swaps two instances in the set. Delegates to the underlying
 * instance vector.
 *
 * @param i the first instance's index
 * @param j the second instance's index
 */
private void swap(int i, int j){
m_Instances.swap(i, j);
}
/**
 * Merges two sets of Instances together. The resulting set will have
 * all the attributes of the first set plus all the attributes of the
 * second set. The number of instances in both sets must be the same.
 *
 * @param first the first set of Instances
 * @param second the second set of Instances
 * @return the merged set of Instances
 * @exception IllegalArgumentException if the datasets are not the same size
 */
public static Instances mergeInstances(Instances first, Instances second) {
  if (first.numInstances() != second.numInstances()) {
    throw new IllegalArgumentException("Instance sets must be of the same size");
  }

  // Collect the attributes of both datasets, first's before second's.
  FastVector mergedAtts = new FastVector();
  for (int i = 0; i < first.numAttributes(); i++) {
    mergedAtts.addElement(first.attribute(i));
  }
  for (int i = 0; i < second.numAttributes(); i++) {
    mergedAtts.addElement(second.attribute(i));
  }

  // Build the new dataset and fill it by pairing up the instances
  // of both sets positionally.
  String relation = first.relationName() + '_' + second.relationName();
  Instances result =
      new Instances(relation, mergedAtts, first.numInstances());
  for (int i = 0; i < first.numInstances(); i++) {
    result.add(first.instance(i).mergeInstance(second.instance(i)));
  }
  return result;
}
/**
* Method for testing this class.
*
* @param argv should contain one element: the name of an ARFF file
*/
public static void test(String [] argv) {
Instances instances, secondInstances, train, test, empty;
Random random = new Random(2);
Reader reader;
int start, num;
FastVector testAtts, testVals;
int i,j;
try{
if (argv.length > 1) {
throw (new Exception("Usage: Instances [<filename>]"));
}
// Creating set of instances from scratch
testVals = new FastVector(2);
testVals.addElement("first_value");
testVals.addElement("second_value");
testAtts = new FastVector(2);
testAtts.addElement(new Attribute("nominal_attribute", testVals));
testAtts.addElement(new Attribute("numeric_attribute"));
instances = new Instances("test_set", testAtts, 10);
instances.add(new Instance(instances.numAttributes()));
instances.add(new Instance(instances.numAttributes()));
instances.add(new Instance(instances.numAttributes()));
instances.setClassIndex(0);
System.out.println("\nSet of instances created from scratch:\n");
System.out.println(instances);
if (argv.length == 1) {
String filename = argv[0];
reader = new FileReader(filename);
// Read first five instances and print them
System.out.println("\nFirst five instances from file:\n");
instances = new Instances(reader, 1);
instances.setClassIndex(instances.numAttributes() - 1);
i = 0;
while ((i < 5) && (instances.readInstance(reader))) {
i++;
}
System.out.println(instances);
// Read all the instances in the file
reader = new FileReader(filename);
instances = new Instances(reader);
// Make the last attribute be the class
instances.setClassIndex(instances.numAttributes() - 1);
// Print header and instances.
System.out.println("\nDataset:\n");
System.out.println(instances);
System.out.println("\nClass index: "+instances.classIndex());
}
// Test basic methods based on class index.
System.out.println("\nClass name: "+instances.classAttribute().name());
System.out.println("\nClass index: "+instances.classIndex());
System.out.println("\nClass is nominal: " +
instances.classAttribute().isNominal());
System.out.println("\nClass is numeric: " +
instances.classAttribute().isNumeric());
System.out.println("\nClasse
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -