instances.java
  }

  /**
   * Reads and stores the header of an ARFF file.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if the information is not read
   * successfully
   */
  protected void readHeader(StreamTokenizer tokenizer)
    throws IOException {

    String attributeName;
    FastVector attributeValues;

    // Get name of relation.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
      getNextToken(tokenizer);
      m_RelationName = tokenizer.sval;
      getLastToken(tokenizer, false);
    } else {
      errms(tokenizer, "keyword " + ARFF_RELATION + " expected");
    }

    // Create vector to hold attribute information temporarily.
    m_Attributes = new FastVector();

    // Get attribute declarations.
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
    while (Attribute.ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {

      // Get attribute name.
      getNextToken(tokenizer);
      attributeName = tokenizer.sval;
      getNextToken(tokenizer);

      // Check if attribute is nominal.
      if (tokenizer.ttype == StreamTokenizer.TT_WORD) {

        // Attribute is real, integer, string, or date.
        if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_REAL) ||
            tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_INTEGER) ||
            tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_NUMERIC)) {
          m_Attributes.addElement(new Attribute(attributeName, numAttributes()));
          readTillEOL(tokenizer);
        } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_STRING)) {
          m_Attributes.addElement(new Attribute(attributeName, (FastVector) null,
                                                numAttributes()));
          readTillEOL(tokenizer);
        } else if (tokenizer.sval.equalsIgnoreCase(Attribute.ARFF_ATTRIBUTE_DATE)) {
          String format = null;
          if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
            if ((tokenizer.ttype != StreamTokenizer.TT_WORD) &&
                (tokenizer.ttype != '\'') &&
                (tokenizer.ttype != '\"')) {
              errms(tokenizer, "not a valid date format");
            }
            format = tokenizer.sval;
            readTillEOL(tokenizer);
          } else {
            tokenizer.pushBack();
          }
          m_Attributes.addElement(new Attribute(attributeName, format,
                                                numAttributes()));
        } else {
          errms(tokenizer, "no valid attribute type or invalid enumeration");
        }
      } else {

        // Attribute is nominal.
        attributeValues = new FastVector();
        tokenizer.pushBack();

        // Get values for nominal attribute.
        if (tokenizer.nextToken() != '{') {
          errms(tokenizer, "{ expected at beginning of enumeration");
        }
        while (tokenizer.nextToken() != '}') {
          if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
            errms(tokenizer, "} expected at end of enumeration");
          } else {
            attributeValues.addElement(tokenizer.sval);
          }
        }
        if (attributeValues.size() == 0) {
          errms(tokenizer, "no nominal values found");
        }
        m_Attributes.addElement(new Attribute(attributeName, attributeValues,
                                              numAttributes()));
      }
      getLastToken(tokenizer, false);
      getFirstToken(tokenizer);
      if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
        errms(tokenizer, "premature end of file");
      }
    }

    // Check if data part follows. We can't easily check for EOL.
    if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
      errms(tokenizer, "keyword " + ARFF_DATA + " expected");
    }

    // Check if any attributes have been declared.
    if (m_Attributes.size() == 0) {
      errms(tokenizer, "no attributes declared");
    }

    // Allocate buffers in case sparse instances have to be read.
    m_ValueBuffer = new double[numAttributes()];
    m_IndicesBuffer = new int[numAttributes()];
  }
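
  // Illustrative only: a minimal ARFF header that readHeader() accepts
  // (standard ARFF syntax as documented by WEKA):
  //
  //   @relation weather
  //   @attribute outlook {sunny, overcast, rainy}
  //   @attribute temperature numeric
  //   @attribute windy {TRUE, FALSE}
  //   @data
  //
  // After the call, m_RelationName is "weather", m_Attributes holds one
  // Attribute per declaration in order, and the tokenizer is positioned
  // just after the @data keyword.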
  /**
   * Copies instances from this set to the end of another one.
   *
   * @param from the position of the first instance to be copied
   * @param dest the destination for the instances
   * @param num the number of instances to be copied
   */
  //@ requires 0 <= from && from <= numInstances() - num;
  //@ requires 0 <= num;
  protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) {

    for (int i = 0; i < num; i++) {
      dest.add(instance(from + i));
    }
  }
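
  // For example, copyInstances(0, dest, numInstances()) appends every
  // instance of this set to dest; dest is expected to have a compatible
  // header, since no checking is done here.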
  /**
   * Throws an error message with line number and last token read.
   *
   * @param tokenizer the stream tokenizer
   * @param theMsg the error message to be thrown
   * @throws IOException containing the error message
   */
  protected void errms(StreamTokenizer tokenizer, String theMsg)
    throws IOException {

    throw new IOException(theMsg + ", read " + tokenizer.toString());
  }
  /**
   * Replaces the attribute information by a clone of itself.
   */
  protected void freshAttributeInfo() {

    m_Attributes = (FastVector) m_Attributes.copyElements();
  }
  /**
   * Gets next token, skipping empty lines.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if reading the next token fails
   */
  protected void getFirstToken(StreamTokenizer tokenizer)
    throws IOException {

    while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {}

    // Normalize the token type: quoted strings are reported as plain
    // words, and a bare "?" is flagged so that callers can detect
    // missing values.
    if ((tokenizer.ttype == '\'') ||
        (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }
  /**
   * Gets index, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if it finds a premature end of line
   */
  protected void getIndex(StreamTokenizer tokenizer) throws IOException {

    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    }
  }
  /**
   * Gets token and checks if it is the end of line.
   *
   * @param tokenizer the stream tokenizer
   * @param endOfFileOk true if an end of file is acceptable instead
   * @exception IOException if it doesn't find an end of line
   */
  protected void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk)
    throws IOException {

    if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
        ((tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
      errms(tokenizer, "end of line expected");
    }
  }
  /**
   * Gets next token, checking for a premature end of line.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if it finds a premature end of line
   */
  protected void getNextToken(StreamTokenizer tokenizer)
    throws IOException {

    if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
      errms(tokenizer, "premature end of line");
    }
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
      errms(tokenizer, "premature end of file");
    } else if ((tokenizer.ttype == '\'') ||
               (tokenizer.ttype == '"')) {
      tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) &&
               (tokenizer.sval.equals("?"))) {
      tokenizer.ttype = '?';
    }
  }
  /**
   * Initializes the StreamTokenizer used for reading the ARFF file.
   *
   * @param tokenizer the stream tokenizer
   */
  protected void initTokenizer(StreamTokenizer tokenizer) {

    tokenizer.resetSyntax();
    tokenizer.whitespaceChars(0, ' ');
    tokenizer.wordChars(' ' + 1, '\u00FF');
    tokenizer.whitespaceChars(',', ',');
    tokenizer.commentChar('%');
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    tokenizer.ordinaryChar('{');
    tokenizer.ordinaryChar('}');
    tokenizer.eolIsSignificant(true);
  }
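
  // Illustrative effect of this syntax table: in the line
  //
  //   @attribute outlook {sunny, overcast, rainy}
  //
  // commas count as whitespace, '%' starts a comment, quoted strings
  // come back as single tokens, and '{' / '}' are reported as ordinary
  // characters, so the tokens are "@attribute", "outlook", '{', "sunny",
  // "overcast", "rainy", '}' and the end-of-line marker.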
  /**
   * Returns a string including all instances and their weights, one
   * instance per line.
   *
   * @return description of the instances and their weights as a string
   */
  protected /*@pure@*/ String instancesAndWeights() {

    StringBuffer text = new StringBuffer();
    for (int i = 0; i < numInstances(); i++) {
      text.append(instance(i) + " " + instance(i).weight());
      if (i < numInstances() - 1) {
        text.append("\n");
      }
    }
    return text.toString();
  }
  /**
   * Partitions the instances around a pivot. Used by quicksort and
   * kthSmallestValue.
   *
   * @param attIndex the attribute's index
   * @param l the first index of the subset
   * @param r the last index of the subset
   *
   * @return the index of the middle element
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= l && l <= r && r < numInstances();
  protected int partition(int attIndex, int l, int r) {

    double pivot = instance((l + r) / 2).value(attIndex);

    while (l < r) {
      while ((instance(l).value(attIndex) < pivot) && (l < r)) {
        l++;
      }
      while ((instance(r).value(attIndex) > pivot) && (l < r)) {
        r--;
      }
      if (l < r) {
        swap(l, r);
        l++;
        r--;
      }
    }
    if ((l == r) && (instance(r).value(attIndex) > pivot)) {
      r--;
    }
    return r;
  }
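
  // Sketch of the contract: partition returns an index m such that every
  // value of the attribute in [l..m] is <= every value in [m+1..r]. For
  // instance, on attribute values {5, 2, 8, 1} with l = 0 and r = 3 the
  // pivot is 2 (the middle element) and the method returns 1, leaving
  // {1, 2} left of the split and {8, 5} right of it.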
  /**
   * Implements quicksort according to Manber's "Introduction to
   * Algorithms".
   *
   * @param attIndex the attribute's index
   * @param left the first index of the subset to be sorted
   * @param right the last index of the subset to be sorted
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= left && left <= right && right < numInstances();
  protected void quickSort(int attIndex, int left, int right) {

    if (left < right) {
      int middle = partition(attIndex, left, right);
      quickSort(attIndex, left, middle);
      quickSort(attIndex, middle + 1, right);
    }
  }
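
  // A typical top-level call would be a sketch like
  //
  //   quickSort(attIndex, 0, numInstances() - 1);
  //
  // which orders the whole set ascending by the given attribute. Note
  // that partition() returns an index strictly below right whenever
  // left < right, so both recursive calls shrink and the sort terminates.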
  /**
   * Reads and skips all tokens before next end of line token.
   *
   * @param tokenizer the stream tokenizer
   * @exception IOException if reading fails
   */
  protected void readTillEOL(StreamTokenizer tokenizer)
    throws IOException {

    // Stop at EOF as well as EOL, so a file that does not end with a
    // newline cannot cause an endless loop.
    while ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) &&
           (tokenizer.ttype != StreamTokenizer.TT_EOF)) {}

    tokenizer.pushBack();
  }
  /**
   * Implements computation of the kth-smallest element according
   * to Manber's "Introduction to Algorithms".
   *
   * @param attIndex the attribute's index
   * @param left the first index of the subset
   * @param right the last index of the subset
   * @param k the value of k
   *
   * @return the index of the kth-smallest element
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= left && left <= right && right < numInstances();
  protected int select(int attIndex, int left, int right, int k) {

    if (left == right) {
      return left;
    } else {
      int middle = partition(attIndex, left, right);
      if ((middle - left + 1) >= k) {
        return select(attIndex, left, middle, k);
      } else {
        return select(attIndex, middle + 1, right, k - (middle - left + 1));
      }
    }
  }
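
  // Note that k is 1-based: select(attIndex, 0, numInstances() - 1, 1)
  // returns the index of a smallest element, and k = numInstances() that
  // of a largest one. The set is partially sorted as a side effect of
  // partition(), which is what kthSmallestValue relies on.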
  /**
   * Helper function needed for stratification of a dataset.
   *
   * @param numFolds the number of folds for the stratification
   */
  protected void stratStep(int numFolds) {

    FastVector newVec = new FastVector(m_Instances.capacity());
    int start = 0, j;

    // Create stratified batch: take every numFolds-th instance,
    // shifting the starting offset until all instances are used.
    while (newVec.size() < numInstances()) {
      j = start;
      while (j < numInstances()) {
        newVec.addElement(instance(j));
        j = j + numFolds;
      }
      start++;
    }
    m_Instances = newVec;
  }
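
  // Example: with 10 instances and numFolds = 3 the new order of the
  // original indices is 0, 3, 6, 9, 1, 4, 7, 2, 5, 8. Taking consecutive
  // chunks of the reordered set as folds then spreads the original
  // order evenly across the folds.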
  /**
   * Swaps two instances in the set.
   *
   * @param i the first instance's index
   * @param j the second instance's index
   */
  //@ requires 0 <= i && i < numInstances();
  //@ requires 0 <= j && j < numInstances();
  public void swap(int i, int j) {

    m_Instances.swap(i, j);
  }
  /**
   * Merges two sets of Instances together. The resulting set will have
   * all the attributes of the first set plus all the attributes of the
   * second set. The number of instances in both sets must be the same.
   *
   * @param first the first set of Instances
   * @param second the second set of Instances
   * @return the merged set of Instances
   * @exception IllegalArgumentException if the datasets are not the same size
   */
  public static Instances mergeInstances(Instances first, Instances second) {

    if (first.numInstances() != second.numInstances()) {
      throw new IllegalArgumentException("Instance sets must be of the same size");
    }