// logfileqsort.java
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 1.1
*/
/**
* An (almost) transparent derivative of a <code>MiningInputStream</code> that
* implements sorting / grouping functionality to enable sequential
* analysis of (web server) log files.
*
* The class provides a stream comprising the elements required for the
* sequential analysis algorithm and can be accessed just like a
* <code>MiningFileStream</code> based class (see the usage sketch at the
* end of this comment).
*
* Various sorting / grouping mechanisms have been implemented to allow
* for processing of log files of _any_ size.
*
* Definitions:
* L = number of transaction-item data pairs
* T = number of transactions; T << L, but T = Theta(L) is assumed
*     (e.g. T = L/10: far smaller in absolute terms, yet still linear in L)
* B = size of a processing block (constant)
*
* Algorithms:
*
* - Trivial: May only be used for very small log files due to a complexity
* of Theta(L*T), equiv Theta(L^2). Because it manages a global hashtable,
* its memory requirement is Theta(T).
* Note: This algorithm returns an exact result. Do not use it except for
* testing with very small data sources.
*
* - Global Block: May be used for small to medium log files.
* The complexity is O(L*log(L)). The whole data source is loaded into memory
* for processing (this requires memory of Theta(L)).
* Note: This algorithm returns an exact result. It is suitable for processing
* certain live data sources: the data source is read only once - however,
* there is no output (and no sorting) until a call to the underlying next()
* has returned false. This algorithm requires the most memory of all
* implementations, so be careful with large log files.
*
* - Decomposition Block: Works on data sources of _any_ size, but should be
* used on medium+ sized data sources. The complexity is Theta(L*log B),
* equiv Theta(L), since B is a constant. B can be chosen virtually
* arbitrarily, but small B's (block sizes) will induce a certain amount
* of errors. This algorithm only uses memory of the amount Theta(B).
* Note: This algorithm does not return an exact result if B < L (the typical
* case). It can be used on any live data source provided the stream is not
* reset (i.e. no "second run" is possible).
* WARNING: There is a debug switch for evaluation of the errors induced by
* the usage of fixed-size blocks. Turning debugging on will affect the
* memory usage, which will change to Theta(L+B).
*
* - Decomposition: Works on data sources of _any_ size, but should be
* used on medium+ sized data sources. The complexity is Theta(L*log B),
* equiv Theta(L), since B is a constant. B can be chosen virtually
* arbitrarily, but small B's (window sizes) will induce a certain amount
* of errors. The technique used is a moving window and a scheduler.
* This algorithm only uses memory of the amount Theta(B).
* Note: This algorithm does not necessarily return an exact result if B < L
* (the typical case). The error rate should be far lower than that of
* decomposition block assuming the same block size. However, the sorting
* speed is far lower as well.
* It can be used on any live data source provided the stream is not reset
* (i.e. no "second run" is possible).
* WARNING: There is a switch to turn the scheduler off. Doing so might
* affect the error rate. Always leave the scheduler on except for testing.
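*
* A hypothetical usage sketch (the file name and the processing loop are
* illustrative assumptions; next() and read() are inherited from
* <code>MiningInputStream</code>, and setSortingMethod() takes one of the
* METHOD_* constants declared below):
* <pre>
*   LogFileQSort sorter = new LogFileQSort("access.log");
*   sorter.setSortingMethod(LogFileQSort.METHOD_DECOMPBLOCK);
*   while (sorter.next()) {
*       MiningVector mv = sorter.read();
*       // process the (transaction ID, item ID) pair in mv
*   }
* </pre>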
*/
package com.prudsys.pdm.Input.Records.Log;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
public class LogFileQSort extends MiningInputStream {
// constants for sorting algorithms to be used w/ setSortingMethod()
public static final int METHOD_TRIVIAL = 1;
public static final int METHOD_GLOBALBLOCK = 2;
public static final int METHOD_DECOMPBLOCK = 3;
public static final int METHOD_DECOMP = 4;
// the data source delivering unsorted, but preprocessed data
private LogFileQParse processedStream;
private int sortingMethod;
// the data type the item ID string is to be converted to
private int itemIDType = CategoricalAttribute.STRING;
// mining vector returned by a call to the read() function
private MiningVector nextMiningVector;
private long nextTransactionID;
// for read() function - indicates whether there was a call to next() (== update) in between or not
private boolean nextMiningVectorUpdated;
// variables for item index of current (!) session
private int itemIndexCurrent;
private long transactionIDLastSeen;
//statistical variables
// time taken (in ms) for the "pure" sorting, not including parsing
public long sortTimeTaken;
// total time taken (in ms), including parsing
public long totalTimeTaken;
/* *************************************************************** */
/* ************* Variables for algorithm "trivial" *************** */
/* *************************************************************** */
private Hashtable trivialHashtable;
private int trivialCursor;
private String trivialCurrentTransactionID;
private int trivialNumTransactions;
/* *************************************************************** */
/* ********** Variables for algorithm "global block" ************* */
/* *************************************************************** */
// initialising that variable here is sort of a dirty trick -
// a "reset" does not purge the results of previous sorting efforts
private boolean globalBlockInitialised = false;
private int globalBlockValidEntries;
private int globalBlockCursor;
// data structure that is used for sorting
private GlobalBlockBaseClass[] globalBlockSortArray;
private class GlobalBlockBaseClass
{
public long transactionIDNumber;
public double itemIDkey;
}
private class GlobalBlockListClass extends GlobalBlockBaseClass
{
GlobalBlockListClass next = null;
}
// a comparator for the global sort memory structure
private class GlobalBlockComparator implements Comparator
{
public int compare(Object o1, Object o2)
{
long i1 = ((GlobalBlockBaseClass)o1).transactionIDNumber;
long i2 = ((GlobalBlockBaseClass)o2).transactionIDNumber;
if (i1==i2)
return 0;
else
return (i1<i2)?-1:1;
}//end compare()
}//end class GlobalBlockComparator
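// Illustrative helper (an assumption for exposition, not called by the
// original code): the global-block algorithm presumably sorts the filled
// portion of its array with the comparator above, along these lines.
private void globalBlockSortSketch()
{
Arrays.sort(globalBlockSortArray, 0, globalBlockValidEntries,
new GlobalBlockComparator());
}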
/* *************************************************************** */
/* ********** Variables for algorithm "decomp block" ************* */
/* *************************************************************** */
// debugging variables:
/* *********** debugging switch ************************* */
private static final boolean decompBlockDebug = false;
/* *********** debugging switch ************************* */
// global hashtable used for debugging
private Hashtable decompBlockDebugHashtable;
// debugging statistics
protected long decompBlockDebugWrongSingle; // number of wrong (artificial) transactions
protected long decompBlockDebugWrongMulti; // number of wrong (artificial) transactions appearing in multiple blocks
// regular variables:
private boolean decompBlockInitialised;
private boolean decompBlockDataPresent; // block contains valid data
private int decompBlockBlockSize = 100000; // block size - initialised w/ a default, but can be changed w/ setBlockSize()
private int decompBlockEntries; // number of entries in block
private int decompBlockCursor; // read cursor
private int decompBlockMaxTransactionNumber; // a variable ensuring transaction ids stay unique beyond block boundaries
// sorting block
private DecompBlockBaseClass[] decompBlockSortArray;
private class DecompBlockBaseClass
{
public int transactionIDNumber;
public double itemIDkey;
}
// a comparator for the decomp sort memory structure - technically identical to the one used for global block sort
private class DecompBlockComparator implements Comparator
{
public int compare(Object o1, Object o2)
{
int i1 = ((DecompBlockBaseClass)o1).transactionIDNumber;
int i2 = ((DecompBlockBaseClass)o2).transactionIDNumber;
if (i1==i2)
return 0;
else
return (i1<i2)?-1:1;
}//end compare()
}//end class DecompBlockComparator
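// Illustrative helper (an assumption for exposition, not called by the
// original code): each block of up to decompBlockBlockSize entries is
// presumably sorted in isolation before being read out; transactions that
// span a block boundary get split, causing the inexactness noted in the
// class comment for B < L.
private void decompBlockSortSketch()
{
Arrays.sort(decompBlockSortArray, 0, decompBlockEntries,
new DecompBlockComparator());
}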
/* *************************************************************** */
/* **** Variables for algorithm "decomp" (floating window) ******* */
/* *************************************************************** */
private int decompWindowSize = 100000; // window size - initialised w/ a default, but can be changed w/ setBlockSize()
protected boolean decompOptimize = true; // optimisation switch; just leave it as it is
private long decompListEntries;
private Hashtable decompHashtable;
protected long decompTransactionIDCounter;
private DecompListClass decompListFirstElement;
private DecompListClass decompListLastElement;
// list elements
private class DecompListClass
{
public long transactionIDNumber;
public String transactionIDString; // only used as a pointer
public double itemIDkey;
public DecompListClass nextElement;
}
// elements associated with transaction id, saved in hashtable
private class DecompHashClass
{
public long transactionIDNumber;
public long numElements;
public DecompListClass firstElement;
public DecompListClass lastElement;
public String previousTransactionIDString;
}
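// Illustrative helper (an assumption for exposition, not called by the
// original code): shows how the hashtable and list structures are expected
// to cooperate when an item arrives for an already-known transaction.
private void decompAppendSketch(String transactionIDString, DecompListClass newNode)
{
DecompHashClass entry = (DecompHashClass) decompHashtable.get(transactionIDString);
if (entry == null)
return; // first item of a transaction would create a new entry instead
entry.lastElement.nextElement = newNode; // append to the per-transaction sub-list
entry.lastElement = newNode;
entry.numElements++;
decompListEntries++;
}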
/* *************************************************************** */
/**
* Constructor. Creates and initializes preprocessor stream.
*
* @param file name of the input file; the parameter is passed to the preprocessor
* @throws MiningException
*/
public LogFileQSort(String file) throws MiningException
{
super();
processedStream = new LogFileQParse(file);
try
{
processedStream.open();
} catch (MiningException ex)
{
throw new MiningException("Error: Cannot open file " + file + ": " + ex.getMessage());
}
// initialise current session
itemIndexCurrent = 1;
transactionIDLastSeen = -1;
// set default sorting method
setSortingMethod(METHOD_DECOMPBLOCK);
}
/**
* Function providing access to the underlying preprocessor.
*
* @return LogFileQParse the preprocessor (an extended log file stream)
*/
public LogFileQParse getPreprocessor()
{
return processedStream;
}
/**
* Returns type of item ID attribute.
*
* @return type of item ID attribute
*/
public int getItemIDType() {
return itemIDType;
}
/**
* Sets the type of the item ID attribute. By default, this is string, but
* the string can be converted to another data type (if possible).
*
* @param itemIDType new data type to convert the itemID strings to
*/
public void setItemIDType(int itemIDType) {
this.itemIDType = itemIDType;
}