📄 fileschema.java

📁 java数据挖掘算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
package shared;
import java.io.*;
import java.util.*;
import java.lang.*;

/** This class represents an MLC++ names file. The FileSchema's main task is
 * to interpret the values in a .data file. Currently, a FileSchema maintains
 * a raw list of attribute infos or COLUMNS, information about which columns
 * should represent label or weight values in the final schema, and an
 * optional loss matrix.									<P>
 * FileSchemas may be created from a names file, or from a preexisting
 * array of attribute infos which may be built programmatically. The label
 * column, weight column, and loss matrix may all be set programmatically.	<P>
 * At any time, a standard MLC++ Schema may be created from the FileSchema
 * through the create_schema() function.						<P>
 * Displaying a FileSchema will do so in the same format used to read
 * FileSchemas from names files.
 * @author James Louis	11/30/2001	Ported to Java.
 * @author Dan Sommerfield 2/26/97 Initial revision (.h, .c)
 */

public class FileSchema
{
/** Attribute infos (columns) held by this FileSchema. The names-file
 *  constructor starts with an empty array and read_attributes() grows it
 *  by one entry per attribute read. **/
   AttrInfo[] attrInfos;

/** LossKeyword value: no loss matrix has been set. apply_loss_spec()
 *  returns immediately while lossKeyword has this value. **/
   public static final byte nomatrix = 0;

/** LossKeyword value. NOTE(review): exact meaning is not shown in this part
 *  of the file -- confirm against the loss specification reader. **/
   public static final byte nodefault = 1;

/** LossKeyword value. NOTE(review): exact meaning is not shown in this part
 *  of the file -- confirm against the loss specification reader. **/
   public static final byte adefault = 2;

/** LossKeyword value. NOTE(review): exact meaning is not shown in this part
 *  of the file -- confirm against the loss specification reader. **/
   public static final byte distance = 3;

   /** Byte value indicating a character is a section delimiter.
    */
   public static final byte sectionDelimiter = 0;

   /** Byte value indicating an end-of-file character has been reached.
    */
   public static final byte sectionEscape = -1;

   /** Byte value indicating a character is an alpha-numerical character.
    */
   public static final byte sectionCharacter = 1;

   /** Maximum size for a String value.
    */
   public static final int MAX_INPUT_STRING_SIZE = 1000;

/** Current loss keyword; holds one of the LossKeyword byte constants above.
 *  The names-file constructor initializes it to nomatrix. **/
   private byte lossKeyword;

   /** Loss arguments. The names-file constructor allocates three slots.
    */
   double[] lossArgs;

   /** Loss entries for this schema. Declared without an initializer, so it
    * is null until some code assigns it.
    */
   FSLossEntry[] lossEntries;

/** Index into attrInfos of the label column, or -1 when there is none. **/
   int labelColumn;

/** Index into attrInfos of the weight column, or -1 when there is none. **/
   int weightColumn;

/** TRUE if weights should be ignored, FALSE otherwise. **/
   boolean ignoreWeightColumn;

   /** Returns TRUE if the weight column is to  be ignored, FALSE otherwise.
    * @return TRUE if weight column is to be ignored, FALSE otherwise.
    */
   public boolean get_ignore_weight_column()
   {
      // Plain accessor: hands back the stored flag without modification.
      return this.ignoreWeightColumn;
   }

   /** Returns the column number of the column containing weight values.
    * @return A column number.
    */
   public int get_weight_column()
   {
      // Plain accessor: the stored weight column index is returned as-is.
      return this.weightColumn;
   }

   /** Returns the column number of the column containing labels.
    * @return A column number.
    */
   public int get_label_column()
   {
      // Plain accessor: the stored label column index is returned as-is.
      return this.labelColumn;
   }

   /** Apply the loss specification stored in this FileSchema to the given schema. The
    * InstanceList corresponding to the schema should be fully read when this
    * function is called, to make sure that any non-fixed nominals in the schema have
    * all their values showing. Any InstanceList calling this function on its schema
    * MUST call set_schema with the new schema afterwards to ensure that all
    * instances still have the same schema.
    * @param s The schema to which the loss specification is to be applied.
    */
   public void apply_loss_spec(Schema s)
   {
      // With no loss matrix configured there is nothing to apply. The
      // schema argument is not touched on either path.
      final boolean lossMatrixWasSet = (lossKeyword != nomatrix);
      if (lossMatrixWasSet)
      {
         // Applying a loss specification is not implemented; only warn.
         String warning = "Warning-->FileSchema::apply_loss_spec: this function "
                + "is not currently implemented, reaching this point may yield "
                + "undesirable results";
         System.out.println(warning);
      }
   }

   /** Constructor.
    * @param namesFile Name of the namesfile containing the schema to be used.
    */
   public FileSchema(String namesFile)
   {
      lossKeyword = nomatrix;
      lossArgs = new double[3];
      labelColumn = -1;
      weightColumn = -1;
      ignoreWeightColumn = false;
      try
      {
         BufferedReader in = new BufferedReader(new FileReader(namesFile));
         attrInfos = new AttrInfo[0];
         read_names(in);
         check_for_duplicates();
      } catch(FileNotFoundException e)
      {
         e.printStackTrace();
      }
   }

   /** Copy constructor.
    * @param other The FileSchema to be copied.
    */
   public FileSchema(FileSchema other)
   {
      // Scalar state is copied by value.
      lossKeyword = other.lossKeyword;
      labelColumn = other.labelColumn;
      weightColumn = other.weightColumn;
      ignoreWeightColumn = other.ignoreWeightColumn;

      // Loss arguments are primitives, so an element copy is a full copy.
      lossArgs = new double[other.lossArgs.length];
      System.arraycopy(other.lossArgs, 0, lossArgs, 0, lossArgs.length);

      // Give the copy its own lossEntries array so that replacing a slot in
      // one FileSchema does not show up in the other. The FSLossEntry
      // objects themselves are still shared. A null source array stays null.
      if (other.lossEntries == null)
         lossEntries = null;
      else
         lossEntries = (FSLossEntry[])other.lossEntries.clone();

      //because attrInfos is an array of references, we need to make a deep
      //copy instead of just copying the array
      attrInfos = new AttrInfo[other.attrInfos.length];
      try
      {
         for(int i=0 ; i<attrInfos.length ; i++)
            attrInfos[i] =(AttrInfo)other.attrInfos[i].clone();
      } catch(CloneNotSupportedException e)
      {
         Error.err("FileSchema:copyConstructor: Clone not" 
                + " supported exception caught");
      }

      // Verify the integrity constraints on the freshly copied state.
      OK();
   }

   /** Returns the number of attributes in this FileSchema.
    * @return The number of attributes.
    */
   public int num_attr()
   {
      // The attribute count is simply the length of the attribute array.
      final int attributeCount = attrInfos.length;
      return attributeCount;
   }

   /** Sets whether the weighted column should be ignored.
    * @param i TRUE if the weight column should be ignored, FALSE otherwise.
    */
   private void set_ignore_weight_column(boolean i)
   {
      // Plain mutator: record the caller's choice in the instance flag.
      this.ignoreWeightColumn = i;
   }

   /** Reads the names file and builds this FileSchema. This is a support function to
    * the constructors.
    *
    * @param in The reader from which the file information is accessed.
    */
   private void read_names(BufferedReader in)
   {
      // The weight column is treated as a regular attribute here, so the
      // "ignore weight column" flag ends up false.
      final boolean weightIsAttribute = true;
      set_ignore_weight_column(!weightIsAttribute);

      // The file opens with something shaped like an attribute info, which
      // is parsed under the name "Label". A single value "config" announces
      // a config section; anything else means compatibility mode.
      skip_white_comments_same_line(in);
      boolean configSection = false;

      AttrInfo label = read_attr_info(in, "Label");

      MLJ.ASSERT(label != null,"FileSchema.read_names: labelInfo == null.");

      if (label.can_cast_to_nominal())
      {
         if (label.cast_to_nominal().is_fixed()
                && label.cast_to_nominal().num_values() == 1)
         {
            String onlyValue = label.cast_to_nominal().get_value(Globals.FIRST_NOMINAL_VAL);
            // This error is raised for every single-valued label, before
            // the keyword checks below are reached.
            Error.err("FileSchema:read_names: " 
                   + "I don\'t think I should reach this, this case not handled!");
            if (onlyValue.equals("config"))
            {
               label = null;
               configSection = true;
            }
            // NOTE(review): "nolobal" looks like a typo for "nolabel" --
            // confirm against the names-file format before changing it.
            else if (onlyValue.equals("nolobal"))
            {
               label = null;
            }
            else
            {
               System.out.println("Warning-->FileSchema::read_names: " 
                      + "compatibility mode label was specified with the single" 
                      + " value " + onlyValue + ".  This" 
                      + " is likely a mistake.");
            }
         }
      }
      else
      {
         // A label that cannot be treated as nominal is rejected.
         Error.err("FileSchema::read_names: Compatibility-" 
                + "mode label was specified as \'continous\' -->fatal_error");
      }

      if (configSection)
      {
         // Config mode: options first, then attributes (a loss section is
         // allowed), then the options are applied.
         MLJ.ASSERT(label == null,"FileSchema.read_names:labelInfo != null.");
         OptionServer options = new OptionServer();
         read_config(in, options);
         read_attributes(in,true);
         apply_config(options);
      }
      else
      {
         // Compatibility mode: read the attributes, then append the label
         // as the last column.
         read_attributes(in, false);
         if (label != null)
         {
            final int previousCount = attrInfos.length;
            labelColumn = previousCount;
            AttrInfo[] grown = new AttrInfo[previousCount + 1];
            System.arraycopy(attrInfos, 0, grown, 0, previousCount);
            grown[previousCount] = label;
            attrInfos = grown;
         }
      }
      OK();
   }

   /** Read a configuration section from a file and store it in the provided option
    * server.
    *
    * @param in Reader from which file data will be accessed.
    * @param configOptions The option server that stores option information.
    */
   private void read_config(BufferedReader in, OptionServer configOptions)
   {
      String optName;
      skip_white_comments_same_line(in);
      // Each pass handles one "name: value" option; the word "endconfig"
      // ends the section. String contents must be compared with equals():
      // the != operator compares object identity, so a name read from the
      // file would not be recognised as the terminator.
      while(!"endconfig".equals(optName = read_section_ws(in, ":\n" , "\t\r")))
      {
         try
         {
            // The option name must be followed directly by a colon.
            if ((char)in.read()!= ':')
               System.out.println("FileSchema::read_config:expecting a colon after the configuration option \"" + optName + "\"");
         } catch(IOException e)
         {
            // Previously swallowed silently. Still non-fatal, but the
            // failure is now visible to whoever is diagnosing a bad file.
            System.out.println("Warning-->FileSchema::read_config: I/O error while "
                   + "reading the separator after the configuration option \""
                   + optName + "\": " + e.getMessage());
         }
         skip_white_comments_same_line(in);
         // The rest of the line is the option's value.
         String optVal = read_section(in, "\n" , "\r");
         configOptions.set_option(optName,optVal);
         skip_white_comments_same_line(in);
      }
      skip_white_comments_same_line(in);
   }

   /** Ensures that the schema has no duplicate attributes. We use an n^2 algorithm--
    * this could be done faster by sorting the list first.
    *
    */
   private void check_for_duplicates()
   {
      boolean foundDuplicate = false;
      final int count = attrInfos.length;

      // Compare every unordered pair of attributes by name.
      for(int first = 0 ; first < count - 1 ; first++)
      {
         for(int second = first + 1 ; second < count ; second++)
         {
            if (!attrInfos[first].name().equals(attrInfos[second].name()))
               continue;

            // The header is reported once, ahead of the first duplicate.
            if (!foundDuplicate)
               Error.err("FileSchema::" 
                      + "check_for_duplicates: duplicate attribute " 
                      + "names detected...");
            Error.err("Duplicate attribute: " 
                   + attrInfos[first].name());
            foundDuplicate = true;
         }
      }

      // A closing error is reported if any duplicate was seen.
      if (foundDuplicate)
         Error.err("check_for_duplicates");
   }

   /** Checks Integrity constraints:                                               <BR>
    * The labelColumn must be either -1, or refer to a NominalAttrInfo.           <BR>
    * The weightColumn must be either -1, or refer to a RealAttrInfo.             <BR>
    * Duplicate column names are not permitted.                                   <BR>
    *  We should check that the number of label values > 0. However, we cannot make
    * the check at this stage because the label info may be a non-fixed value set
    * which has not yet accumulated any values (i.e. before reading the list!)
    *
    */
   private void OK()
   {
      if (labelColumn == -1)
      {
         // Without a label column there must be no loss information.
         // lossEntries has no initializer and the names-file constructor
         // does not assign it directly, so it can still be null here;
         // null is treated as "no entries" instead of being dereferenced.
         boolean hasLossEntries = (lossEntries != null && lossEntries.length != 0);
         if (lossKeyword != nomatrix || hasLossEntries)
            Error.err("FileSchema::OK:not OK");
      }
      else
      {
         // A label column must refer to an attribute usable as nominal.
         if (!attrInfos[labelColumn].can_cast_to_nominal())
            Error.err("FileSchema::OK:not OK");
      }

      // A weight column, when present, must refer to an attribute usable
      // as real.
      if (weightColumn!=-1)
         if (!attrInfos[weightColumn].can_cast_to_real())
            Error.err("FileSchema::OK:not OK");
      //check_for_duplicates()	 
   }

   /** Reads attributes from the names file into the array of attributes maintained by
    * this FileSchema class. If lossOK is TRUE, then the word "loss" may appear at
    * the top of the attribute list.  The section between loss and endloss will be
    * interpreted as a loss matrix specification.
    * @param namesFile Reader allowing access to the namesfile.
    * @param lossOK TRUE if a loss specification is allowed here. If FALSE, an
    * error is reported when one is found.
    */
   private void read_attributes(BufferedReader namesFile, boolean lossOK)
   {
      // FileSchema should have no attributes
      if (attrInfos.length != 0)
         Error.err("attributes already in file schema, it has" +attrInfos.length + " as a length");
      String attrName;
      skip_white_comments_same_line(namesFile);
      try
      {
         // One pass per "name: info" attribute entry.
         while(namesFile.ready())
         {
            // One-element array handed to read_word; presumably an output
            // flag -- its value is not used in this method.
            boolean[] sameLine = new boolean[1];

            attrName = read_word(namesFile, false, sameLine);

            // The word "loss" is ambiguous: followed by ':' it is an
            // ordinary attribute called "loss", otherwise it opens a loss
            // specification. Telling them apart consumes one character, so
            // remember when that character was the attribute's colon.
            // Previously the colon was consumed here and then demanded
            // again below, giving a spurious error for such an attribute.
            boolean colonAlreadyRead = false;
            if (attrName.equals("loss"))
            {
               if ((char)namesFile.read()!= ':')
               {
                  if (!lossOK)
                  {
                     Error.err("The loss specification must appear " 
                            + "between the config section and the list of " 
                            + "attributes.  It may not be used in compatibility " 
                            + "mode");
                  }
                  read_loss_spec(namesFile);
                  // Only one loss specification is accepted.
                  lossOK = false;
                  // read the word following the loss specification
                  attrName = read_word(namesFile, false, sameLine);
               }
               else
               {
                  colonAlreadyRead = true;
               }
            }
            if (!colonAlreadyRead && (char)namesFile.read()!= ':')
               Error.err(" " 
                      + "Expecting a \':\' following " 
                      + "attribute name " + attrName);
            skip_white_comments_same_line(namesFile);
            AttrInfo ai = read_attr_info(namesFile, attrName);

            // Grow the attribute array by one and append the new info.
            AttrInfo[] grown = new AttrInfo[attrInfos.length+1];
            System.arraycopy(attrInfos, 0, grown, 0, attrInfos.length);
            grown[grown.length-1] = ai;
            attrInfos = grown;

            skip_white_comments_same_line(namesFile);
         }
      } catch(IOException e)
      {
         Error.err("file can\'t be read");
      }
   }

   /** Set an attribute info. The given AttrInfo reference is stored directly;
    * no copy is made.
    * @param i Number of the attribute.
    * @param a Attribute information.
    */
   public void set_attr_info(int i, AttrInfo a)
   {
      // Report an index that falls outside the current attribute array.
      final boolean inRange = (i >= 0 && i < attrInfos.length);
      if (!inRange)
         Error.err("FileSchema::set_attr_info: index " 
                + i + " is out of range -->fatal_error");

      // The passed-in reference replaces whatever the slot held before.
      attrInfos[i] = a;

      // Re-validate the label/weight column constraints after the change.
      OK();
   }

   /** Reads a list of "words" (as described in read_word()) that correspond to
    * nominal values. Certain special words denote ignored attributes, non-fixed
    * nominals, real attribute infos, or linear attribute infos. If these are
12 3 下一页
💿 文件大小 441 K
👤 上传用户 l2335800
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#java #数据挖掘算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -