📄 fileschema.java
字号:
package shared;
import java.io.*;
import java.util.*;
import java.lang.*;
/** This class represents an MLC++ names file. The FileSchema's main task is
* to interpret the values in a .data file. Currently, a FileSchema maintains
* a raw list of attribute infos or COLUMNS, information about which columns
* should represent label or weight values in the final schema, and an
* optional loss matrix. <P>
* FileSchemas may be created from a names file, or from a preexisting
* array of attribute infos which may be built programmatically. The label
* column, weight column, and loss matrix may all be set programmatically. <P>
* At any time, a standard MLC++ Schema may be created from the FileSchema
* through the create_schema() function. <P>
* Displaying a FileSchema will do so in the same format used to read
* FileSchemas from names files.
* @author James Louis 11/30/2001 Ported to Java.
* @author Dan Sommerfield 2/26/97 Initial revision (.h, .c)
*/
public class FileSchema
{
/** Attribute infos (the columns) held by this FileSchema. **/
AttrInfo[] attrInfos;
/** LossKeyword value: no loss matrix has been set; apply_loss_spec() returns
* immediately when lossKeyword has this value. **/
public static final byte nomatrix = 0;
/** LossKeyword value. **/
public static final byte nodefault = 1;
/** LossKeyword value. **/
public static final byte adefault = 2;
/** LossKeyword value. **/
public static final byte distance = 3;
/** Byte value indicating a character is a section delimiter.
*/
public static final byte sectionDelimiter = 0;
/** Byte value indicating an end-of-file character has been reached.
*/
public static final byte sectionEscape = -1;
/** Byte value indicating a character is an alphanumeric character.
*/
public static final byte sectionCharacter = 1;
/** Maximum size for a String value.
*/
public static final int MAX_INPUT_STRING_SIZE = 1000;
/** Current loss keyword; holds one of the LossKeyword values declared above.
* The names-file constructor initialises it to nomatrix. **/
private byte lossKeyword;
/** Loss arguments. The names-file constructor allocates three slots.
*/
double[] lossArgs;
/** Loss entries for this schema. NOTE(review): no visible constructor assigns
* this field, so it may be null until a loss specification is read -- confirm.
*/
FSLossEntry[] lossEntries;
/** Index into attrInfos of the label column, or -1 when no label column is set. **/
int labelColumn;
/** Index into attrInfos of the weight column, or -1 when no weight column is set. **/
int weightColumn;
/** TRUE if weights should be ignored, FALSE otherwise. **/
boolean ignoreWeightColumn;
/** Returns TRUE if the weight column is to be ignored, FALSE otherwise.
* @return TRUE if weight column is to be ignored, FALSE otherwise.
*/
public boolean get_ignore_weight_column()
{
    // Plain accessor: hands back the stored ignore-weight flag unchanged.
    final boolean ignoreFlag = this.ignoreWeightColumn;
    return ignoreFlag;
}
/** Returns the column number of the column containing weight values.
* @return A column number.
*/
public int get_weight_column()
{
    // Plain accessor: the stored weight-column index (-1 means none is set).
    final int column = this.weightColumn;
    return column;
}
/** Returns the column number of the column containing labels.
* @return A column number.
*/
public int get_label_column()
{
    // Plain accessor: the stored label-column index (-1 means none is set).
    final int column = this.labelColumn;
    return column;
}
/** Apply the loss specification stored in this FileSchema to the given schema. The
* InstanceList corresponding to the schema should be fully read when this
* function is called, to make sure that any non-fixed nominals in the schema have
* all their values showing. Any InstanceList calling this function on its schema
* MUST call set_schema with the new schema afterwards to ensure that all
* instances still have the same schema.
* @param s The schema to which the loss specification is to be applied.
*/
public void apply_loss_spec(Schema s)
{
    // Only act when a loss matrix was actually specified; otherwise there is
    // nothing to apply.
    if (lossKeyword != nomatrix)
    {
        // Applying the loss matrix is not implemented: the schema argument is
        // not touched, and a warning is printed instead.
        final String warning = "Warning-->FileSchema::apply_loss_spec: this function "
            + "is not currently implemented, reaching this point may yield "
            + "undesirable results";
        System.out.println(warning);
    }
}
/** Constructor.
* @param namesFile Name of the namesfile containing the schema to be used.
*/
public FileSchema(String namesFile)
{
lossKeyword = nomatrix;
lossArgs = new double[3];
labelColumn = -1;
weightColumn = -1;
ignoreWeightColumn = false;
try
{
BufferedReader in = new BufferedReader(new FileReader(namesFile));
attrInfos = new AttrInfo[0];
read_names(in);
check_for_duplicates();
} catch(FileNotFoundException e)
{
e.printStackTrace();
}
}
/** Copy constructor.
* @param other The FileSchema to be copied.
*/
public FileSchema(FileSchema other)
{
    // Copy constructor: builds an independent FileSchema from another one.
    // other: the FileSchema to be copied.
    lossKeyword = other.lossKeyword;
    labelColumn = other.labelColumn;
    weightColumn = other.weightColumn;
    ignoreWeightColumn = other.ignoreWeightColumn;
    lossArgs = (double[]) other.lossArgs.clone();
    // Copy the loss-entry array rather than aliasing it, so that replacing an
    // entry in one schema does not silently change the other. The entries
    // themselves are still shared references.
    lossEntries = (other.lossEntries == null) ? null : other.lossEntries.clone();
    //because attrInfos is an array of references, we need to make a deep
    //copy instead of just copying the array
    attrInfos = new AttrInfo[other.attrInfos.length];
    try
    {
        for(int i = 0 ; i < attrInfos.length ; i++)
            attrInfos[i] = (AttrInfo)other.attrInfos[i].clone();
    } catch(CloneNotSupportedException e)
    {
        Error.err("FileSchema:copyConstructor: Clone not"
            + " supported exception caught");
    }
    // Verify the integrity constraints of the freshly built copy.
    OK();
}
/** Returns the number of attributes in this FileSchema.
* @return The number of attributes.
*/
public int num_attr()
{
    // The attribute count is simply the size of the attribute-info array.
    final int count = this.attrInfos.length;
    return count;
}
/** Sets whether the weighted column should be ignored.
* @param i TRUE if the weight column should be ignored, FALSE otherwise.
*/
private void set_ignore_weight_column(boolean i)
{
    // Plain mutator: records whether the weight column is to be ignored.
    this.ignoreWeightColumn = i;
}
/** Reads the names file and builds this FileSchema. This is a support function to
* the constructors.
*
* @param in The reader from which the file information is accessed.
*/
private void read_names(BufferedReader in)
{
    // Parses a names file from the given reader and fills in this FileSchema.
    // in: reader positioned at the start of the names file.

    // weightIsAttribute is hard-wired to true, so the ignore flag is cleared.
    boolean weightIsAttribute = true;
    set_ignore_weight_column(!weightIsAttribute);
    //First, try to read an attribute info, with the name "Label".
    //If we get a single name "config", read a config section instead.
    //Otherwise, enter compatibility mode.
    skip_white_comments_same_line(in);
    boolean haveConfig = false;
    AttrInfo labelInfo = read_attr_info(in, "Label");
    MLJ.ASSERT(labelInfo != null,"FileSchema.read_names: labelInfo == null.");
    if (!labelInfo.can_cast_to_nominal())
        Error.err("FileSchema::read_names: Compatibility-"
            + "mode label was specified as \'continous\' -->fatal_error");
    else if (labelInfo.cast_to_nominal().is_fixed()&&
        labelInfo.cast_to_nominal().num_values()== 1)
    {
        String singleName = labelInfo.cast_to_nominal().get_value(Globals.FIRST_NOMINAL_VAL);
        // Kept from the port: single-valued labels are flagged as unhandled
        // before the keyword is inspected.
        Error.err("FileSchema:read_names: "
            + "I don\'t think I should reach this, this case not handled!");
        if (singleName.equals("config"))
        {
            labelInfo = null;
            haveConfig = true;
        }
        // The keyword meaning "this file has no label" is "nolabel"; the
        // previous literal "nolobal" was a typo that could never match.
        else if (singleName.equals("nolabel"))
        {
            labelInfo = null;
        }
        else
            System.out.println("Warning-->FileSchema::read_names: "
                + "compatibility mode label was specified with the single"
                + " value " + singleName + ". This"
                + " is likely a mistake.");
    }
    //if we have a config section, read config, then attributes
    if (haveConfig)
    {
        MLJ.ASSERT(labelInfo == null,"FileSchema.read_names:labelInfo != null.");
        OptionServer configOptions = new OptionServer();
        read_config(in, configOptions);
        read_attributes(in,true);
        apply_config(configOptions);
    }
    // if in compatibility mode, read attributes and tack a label on end
    else
    {
        read_attributes(in, false);
        if (labelInfo != null)
        {
            // The label becomes the last column, right after the attributes
            // that were just read.
            labelColumn = attrInfos.length;
            AttrInfo[] grown = new AttrInfo[attrInfos.length + 1];
            System.arraycopy(attrInfos, 0, grown, 0, attrInfos.length);
            grown[grown.length - 1] = labelInfo;
            attrInfos = grown;
        }
    }
    // Verify the integrity constraints of the schema that was just read.
    OK();
}
/** Read a configuration section from a file and store it in the provided option
* server.
*
* @param in Reader from which file data will be accessed.
* @param configOptions The option server that stores option information.
*/
private void read_config(BufferedReader in, OptionServer configOptions)
{
    // Reads "name: value" option lines from the reader into configOptions
    // until the terminating word "endconfig" is read.
    // in: reader from which file data is accessed.
    // configOptions: option server that receives each option name/value pair.
    String optName;
    skip_white_comments_same_line(in);
    // Compare string CONTENTS: the previous reference comparison
    // (!= "endconfig") is true for any string built at run time, so the
    // terminator was never recognised.
    while(!"endconfig".equals(optName = read_section_ws(in, ":\n" , "\t\r")))
    {
        try
        {
            if ((char)in.read()!= ':')
                System.out.println("FileSchema::read_config:expecting a colon after the configuration option \"" + optName + "\"");
        } catch(IOException e)
        {
            // Report the failure instead of silently swallowing it; parsing
            // continues as before.
            System.out.println("FileSchema::read_config: I/O error while reading the separator after the configuration option \"" + optName + "\": " + e.getMessage());
        }
        skip_white_comments_same_line(in);
        String optVal = read_section(in, "\n" , "\r");
        configOptions.set_option(optName,optVal);
        skip_white_comments_same_line(in);
    }
    skip_white_comments_same_line(in);
}
/** Ensures that the schema has no duplicate attributes. We use an n^2 algorithm--
* this could be done faster by sorting the list first.
*
*/
private void check_for_duplicates()
{
    // Compares every pair of attribute names. The first clash reports a
    // header, every clash reports the offending name, and a final error is
    // raised if any clash was seen.
    boolean foundAny = false;
    final int count = attrInfos.length;
    for (int first = 0; first < count - 1; first++)
    {
        for (int second = first + 1; second < count; second++)
        {
            if (!attrInfos[first].name().equals(attrInfos[second].name()))
                continue;
            if (!foundAny)
            {
                Error.err("FileSchema::"
                    + "check_for_duplicates: duplicate attribute "
                    + "names detected...");
            }
            Error.err("Duplicate attribute: "
                + attrInfos[first].name());
            foundAny = true;
        }
    }
    if (foundAny)
    {
        Error.err("check_for_duplicates");
    }
}
/** Checks Integrity constraints: <BR>
* The labelColumn must be either -1, or refer to a NominalAttrInfo. <BR>
* The weightColumn must be either -1, or refer to a RealAttrInfo. <BR>
* Duplicate column names are not permitted. <BR>
* We should check that the number of label values > 0. However, we cannot make
* the check at this stage because the label info may be a non-fixed value set
* which has not yet accumulated any values (i.e. before reading the list!)
*
*/
private void OK()
{
    // Integrity check: reports "not OK" through Error.err when the label,
    // weight, or loss settings are inconsistent.
    if (labelColumn == -1)
    {
        // Without a label column no loss information may be present.
        // lossEntries can still be null here (nothing visible assigns it
        // before the first check), which counts as "no entries" rather than
        // a NullPointerException.
        boolean hasLossEntries = lossEntries != null && lossEntries.length != 0;
        if (lossKeyword != nomatrix || hasLossEntries)
            Error.err("FileSchema::OK:not OK");
    }
    else
    {
        // The label column must be usable as a nominal attribute.
        if (!attrInfos[labelColumn].can_cast_to_nominal())
            Error.err("FileSchema::OK:not OK");
    }
    // The weight column, when set, must be usable as a real attribute.
    if (weightColumn != -1)
    {
        if (!attrInfos[weightColumn].can_cast_to_real())
            Error.err("FileSchema::OK:not OK");
    }
    //check_for_duplicates()
}
/** Reads attributes from the names file into an array of attributes maintained by
* this FileSchema class. If lossOK is TRUE, then the word "loss" may appear at
* the top of the attribute list. The section between loss and endloss will be
* interpreted as a loss matrix specification.
* @param namesFile Reader allowing access to the names file.
* @param lossOK TRUE if a loss specification is allowed at this point; if FALSE,
* encountering one is reported as an error.
*/
private void read_attributes(BufferedReader namesFile, boolean lossOK)
{
    // Reads "name: type" attribute declarations until the reader reports no
    // more input, appending each one to attrInfos. An optional loss
    // specification introduced by the word "loss" is read first if present.
    // namesFile: reader giving access to the names file.
    // lossOK: whether a loss specification is allowed here; if not, one is
    //         reported through Error.err.

    // FileSchema should have no attributes
    if (attrInfos.length != 0)
        Error.err("attributes already in file schema, it has" +attrInfos.length + " as a length");
    String attrName;
    skip_white_comments_same_line(namesFile);
    try
    {
        while(namesFile.ready())
        {
            boolean[] sameLine = new boolean[1];
            attrName = read_word(namesFile, false, sameLine);
            if (attrName.equals("loss"))
            {
                // Look one character ahead to tell a loss specification
                // ("loss" NOT followed by ':') from an ordinary attribute
                // that happens to be named "loss".
                namesFile.mark(1);
                if ((char)namesFile.read() != ':')
                {
                    // Loss specification: the look-ahead character stays
                    // consumed, exactly as before.
                    if (!lossOK)
                    {
                        Error.err("The loss specification must appear "
                            + "between the config section and the list of "
                            + "attributes. It may not be used in compatibility "
                            + "mode");
                    }
                    read_loss_spec(namesFile);
                    lossOK = false;
                    // read the word following the loss specification
                    attrName = read_word(namesFile, false, sameLine);
                }
                else
                {
                    // Attribute literally named "loss": put the ':' back so
                    // the colon check below sees it. Previously the colon was
                    // swallowed and the check then failed.
                    namesFile.reset();
                }
            }
            if ((char)namesFile.read()!= ':')
                Error.err(" "
                    + "Expecting a \':\' following "
                    + "attribute name " + attrName);
            skip_white_comments_same_line(namesFile);
            AttrInfo ai = read_attr_info(namesFile, attrName);
            // Grow the attribute array by one and append the new info.
            AttrInfo[] grown = new AttrInfo[attrInfos.length + 1];
            System.arraycopy(attrInfos, 0, grown, 0, attrInfos.length);
            grown[grown.length - 1] = ai;
            attrInfos = grown;
            skip_white_comments_same_line(namesFile);
        }
    } catch(IOException e)
    {
        Error.err("file can\'t be read");
    }
}
/** Sets the attribute info at the given index. The reference passed in is stored directly; no copy is made.
* @param i Number of the attribute.
* @param a Attribute information.
*/
public void set_attr_info(int i, AttrInfo a)
{
    // Report an index outside [0, attrInfos.length) before touching the array.
    final boolean inRange = (i >= 0) && (i < attrInfos.length);
    if (!inRange)
    {
        Error.err("FileSchema::set_attr_info: index "
            + i + " is out of range -->fatal_error");
    }
    // Store the given reference in slot i, then re-check the schema's
    // integrity constraints.
    attrInfos[i] = a;
    OK();
}
/** Reads a list of "words" (as described in read_word()) that correspond to
* nominal values. Certain special words denote ignored attributes, non-fixed
* nominals, real attribute infos, or linear attribute infos. If these are
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -