📄 logfileqsort.java
字号:
this.itemIDType = itemIDType;
}
/**
 * Changes the default block size used by both the decomposition-block
 * algorithm and the decomposition algorithm, then re-initialises the
 * currently selected sorting method so the new size takes effect.
 *
 * Note: Small blocks may induce significant errors, while
 * very large blocks may reduce performance (Theta(B*log B)).
 * Also, the blocks permanently occupy a respective amount of memory.
 *
 * @param newBlockSize new block size in #(t-id,item-id) pairs; must be positive
 * @throws MiningException if newBlockSize is not positive, or if
 *         re-initialising the current sorting method fails
 */
public void setBlockSize(int newBlockSize) throws MiningException
{
    // Reject non-positive sizes up front: a value <= 0 would later surface as a
    // NegativeArraySizeException (or a useless zero-capacity buffer/hashtable)
    // when setSortingMethod() allocates the sort structures.
    if (newBlockSize <= 0)
        throw new MiningException("setBlockSize(): block size must be positive (newBlockSize = " + newBlockSize + ").");
    // here we don't care about the sorting method used,
    // blocksize is a uniformly valid parameter for all decomp-type methods
    decompBlockBlockSize = newBlockSize;
    decompWindowSize = newBlockSize;
    setSortingMethod(sortingMethod);
}
/**
 * Selects the sorting method and (re-)initialises all state belonging to it.
 * <BR>See general description of <code>LogFileSequentialSort</code> class
 *
 * @param newSortingMethod one of the METHOD_* constants
 * @throws MiningException if newSortingMethod is not a known method constant
 */
public void setSortingMethod(int newSortingMethod) throws MiningException
{
    sortingMethod = newSortingMethod;
    // timers start from scratch whenever a method is (re-)selected
    sortTimeTaken = 0;
    totalTimeTaken = 0;
    if (sortingMethod == METHOD_TRIVIAL)
    {
        trivialHashtable = new Hashtable();
        trivialCursor = 1;
        trivialNumTransactions = 0;
        trivialCurrentTransactionID = "";
        // sorting and parsing are heavily interwoven for this method,
        // so the pure sort time cannot be measured accurately
        sortTimeTaken = -1;
        return;
    }
    if (sortingMethod == METHOD_GLOBALBLOCK)
    {
        globalBlockCursor = 0;
        return;
    }
    if (sortingMethod == METHOD_DECOMPBLOCK)
    {
        decompBlockInitialised = false;
        decompBlockDataPresent = false;
        decompBlockCursor = 0;
        decompBlockMaxTransactionNumber = 0;
        // pre-allocate the sort buffer (one entry per slot)
        decompBlockSortArray = new DecompBlockBaseClass[decompBlockBlockSize];
        for (int slot = 0; slot < decompBlockBlockSize; slot++)
            decompBlockSortArray[slot] = new DecompBlockBaseClass();
        // with debugging enabled, set up the global hashtable
        // and clear the error counters
        if (decompBlockDebug)
        {
            decompBlockDebugHashtable = new Hashtable();
            decompBlockDebugWrongSingle = 0;
            decompBlockDebugWrongMulti = 0;
        }
        return;
    }
    if (sortingMethod == METHOD_DECOMP)
    {
        decompListEntries = 0;
        decompHashtable = new Hashtable(decompWindowSize);
        decompTransactionIDCounter = 1;
        // the global linked list starts out empty
        decompListFirstElement = null;
        decompListLastElement = null;
        return;
    }
    throw new MiningException("setSortingMethod(): Unknown sorting method (newSortingMethod = " + newSortingMethod + ").");
}
/**
 * Overrides the original getMetaData, because the meta data of the
 * processed stream differs from that of the original data source.<BR>
 * To retrieve the original meta data, call getPreprocessor().getMetaData()
 *
 * @throws MiningException
 * @return MiningDataSpecification meta data of the processed stream
 */
public MiningDataSpecification getMetaData() throws MiningException
{
    MiningDataSpecification processedMetaData = processedStream.metaDataProcessed;
    return processedMetaData;
}
/**
 * Returns a mining vector that is "in order" with respect to transaction id,
 * and stamps it with a per-transaction item index (position 2).
 *
 * The labeling work is done at most once per call to next(): as long as
 * nextMiningVectorUpdated is false, the already-labeled cached vector is
 * returned unchanged.
 *
 * @throws MiningException
 * @return MiningVector the (labeled) current mining vector
 */
public MiningVector read() throws MiningException
{
    // already labeled since the last next()? hand out the cached vector
    if (!nextMiningVectorUpdated)
        return nextMiningVector;
    nextMiningVectorUpdated = false;
    // same transaction as last time: just advance the item index;
    // a new transaction restarts the index at 1
    if (transactionIDLastSeen == nextTransactionID)
    {
        itemIndexCurrent++;
    }
    else
    {
        itemIndexCurrent = 1;
        transactionIDLastSeen = nextTransactionID;
    }
    // write the session-relative item number into the vector
    nextMiningVector.setValue(2, itemIndexCurrent);
    // convert the item ID to the configured target type
    convertItemId();
    return nextMiningVector;
}
/**
 * Converts the item ID string (category at position 1) to the configured
 * item ID type. If the string cannot be parsed, the type's zero/false
 * default is used instead. The (possibly new) category key is written
 * back into position 1 of the current mining vector.
 *
 * @throws MiningException if the configured item ID type is not supported
 */
private void convertItemId() throws MiningException {
    if (itemIDType == CategoricalAttribute.STRING)
        return; // strings need no conversion
    double key = nextMiningVector.getValue(1);
    CategoricalAttribute catAtt = (CategoricalAttribute)
        nextMiningVector.getMetaData().getMiningAttribute(1);
    Category cat = catAtt.getCategory(key);
    // Default value of the target type (0.0 / 0.0f / 0 / false).
    // "0" parses to the correct default for all four supported types,
    // and this call also rejects unsupported itemIDType values early.
    Object catOb = toItemIDObject("0");
    try {
        // best effort: replace the default with the parsed category value
        catOb = toItemIDObject((String) cat.getValue());
    }
    catch (Exception ignored) {
        // parse/cast failure: keep the default, as documented above
    }
    Category catNew = new Category(catOb);
    key = catAtt.getKey(catNew);
    if (Category.isMissingValue(key))
        key = catAtt.addCategory(catNew);
    nextMiningVector.setValue(1, key);
}

/**
 * Parses a string into an object of the configured item ID type.
 *
 * @param value string representation to parse
 * @return boxed value of the configured item ID type
 * @throws MiningException if itemIDType is not one of the supported types
 */
private Object toItemIDObject(String value) throws MiningException {
    if (itemIDType == CategoricalAttribute.DOUBLE)
        return Double.valueOf(value);
    if (itemIDType == CategoricalAttribute.FLOAT)
        return Float.valueOf(value);
    if (itemIDType == CategoricalAttribute.INTEGER)
        return Integer.valueOf(value);
    if (itemIDType == CategoricalAttribute.BOOLEAN)
        return Boolean.valueOf(value);
    throw new MiningException("itemID type " + itemIDType + " not supported");
}
/**
 * Advances the stream by <code>position</code> records. Implemented as a
 * loop of next() calls, so there is no real performance gain over calling
 * next() repeatedly yourself.
 *
 * @param position relative amount of data sets to advance; must be &gt; 0
 * @throws MiningException if position is not positive
 * @return boolean result of the last call to next()
 */
public boolean move( int position ) throws MiningException
{
    // moving backwards (or not at all) is not supported
    if (position <= 0)
        throw new MiningException("Cannot move backwards or advance zero lines in file stream.");
    boolean lastResult = false;
    int remaining = position;
    // advance one record at a time via next()
    while (remaining-- > 0)
        lastResult = next();
    return lastResult;
}
/**
 * next() implementation based on a trivial "sorting" algorithm.<BR>
 * A hashtable is used to identify individual transaction ids - for
 * every "new" (not previously processed) transaction id found, all
 * entries containing that t-id are extracted from the data stream.
 *
 * State used (hedged where SOURCE does not show the declaration):
 * <ul>
 * <li>trivialCurrentTransactionID - t-id currently being emitted; "" means
 *     "scanning for the next unseen t-id"</li>
 * <li>trivialHashtable - set of t-ids already fully emitted (or in progress)</li>
 * <li>trivialCursor - stream position to rescan from; advanced only while
 *     scanning for a new t-id (presumably so already-scanned records are
 *     skipped on the next pass - TODO confirm against processedStream.move)</li>
 * <li>trivialNumTransactions / nextTransactionID - running transaction counters</li>
 * </ul>
 * Note: this approach re-reads the stream once per distinct transaction,
 * which is the deliberate trade-off of the "trivial" method.
 *
 * @throws MiningException
 * @return boolean true if nextMiningVector holds a new record, false at end of data
 */
private boolean trivialNext() throws MiningException
{
boolean haveNextVector;
MiningVector miningVector;
String transactionString;
// search loop, left through return
while(true)
{
haveNextVector = processedStream.next();
if (haveNextVector)
{
// get next mining vector and retrieve transaction id string
// (the t-id is the first element of the vector)
miningVector = processedStream.read();
transactionString = miningVector.toVector().elementAt(0).toString();
// are we working on a certain transaction so far?
// "" means no: we are scanning for the next unseen transaction id
if (trivialCurrentTransactionID.equals(""))
{
// remember how far the scan got, so the EOF branch below
// can resume from here instead of from the stream start
trivialCursor++;
// check if the current transaction is already in hash table
if (trivialHashtable.containsKey(transactionString))
{
// we know that key, so we just continue
continue;
} else
{
// found a new unique transaction
// increase transaction counter
trivialNumTransactions++;
// add transaction id to hashtable
trivialHashtable.put(transactionString, new Integer(trivialNumTransactions));
// set new "current" transaction id
trivialCurrentTransactionID = transactionString;
nextTransactionID++;
nextMiningVector = miningVector;
return true;
}
}
// current mining vector still belongs to the same session? simplest scenario - just output
if (trivialCurrentTransactionID.equals(transactionString))
{
nextMiningVector = miningVector;
return true;
}
// we stumbled over a transaction id which is different from what we are looking for
// (it belongs to some other transaction; it will be picked up on a later rescan)
continue;
}
else // haveNextVector failed
{
// if there is no current transaction and no vector to read, we're done
if (trivialCurrentTransactionID.equals(""))
{
return false;
}
// end of file reached, but there may be work left, so go back to cursor and have a look
// delete current session
trivialCurrentTransactionID = "";
// reset data source
processedStream.reset();
// we move to trivialCursor -1, because the following call of next() also advances "cursor"
processedStream.move(trivialCursor-1);
continue;
} // haveNextVector
} // while(true)
}
/**
* next() implementation based on an algorithm that uses a single global block.<BR>
* A memory structure comprising all the relevant data of the data source is
* created und a merge sort is used for sorting purposes.<BR>
* The sorting speed is very good even for very large files,
* but the memory requirement is likely to cause trouble sooner or later (Theta(L))
*
* @throws MiningException
* @return boolean
*/
private boolean globalBlockNext() throws MiningException
{
// File needs to be read into memory
// initialisiation, including all the sorting is done in advance and only once
if (!globalBlockInitialised)
{
globalBlockInitialised = true;
GlobalBlockListClass globalBlockSortList = new GlobalBlockListClass();
GlobalBlockListClass anchor = globalBlockSortList;
// number of valid entries read (required to get buffer)
globalBlockValidEntries = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -