// logfileqsort.java
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 1.1
*/
/**
* An (almost) transparent derivative of a <code>MiningInputStream</code> that
* implements sorting / grouping functionality to enable sequential
* analysis of (web server) log files.
*
* The class provides a stream comprising the elements required for the
* sequential analysis algorithm and can be accessed just like a
* <code>MiningFileStream</code> based class (see the usage sketch at the
* end of this comment).
*
* Various sorting / grouping mechanisms have been implemented to allow
* for processing of log files of _any_ size.
*
* Definitions:
* L = number of transaction-item data pairs
* T = number of transactions; T << L, but T = Theta(L) is assumed
*     (e.g. T = L/10: far smaller in absolute terms, yet still linear in L)
* B = size of a processing block (constant)
*
* Algorithms:
*
* - Trivial: May only be used for very small log files due to a complexity
* of Theta(L*T), equiv Theta(L^2). Because it manages a global hashtable,
* its memory requirement is Theta(T).
* Note: This algorithm returns an exact result. Do not use it except for
* testing with very small data sources.
*
* - Global Block: May be used for small to medium log files.
* The complexity is O(L*log(L)). The whole data source is loaded into memory
* for processing (this requires memory of Theta(L)).
* Note: This algorithm returns an exact result. It is suitable for processing
* certain live data sources: the data source is read only once - however,
* there is no output (and no sorting) until a call to the underlying next()
* has returned false. This algorithm requires the most memory of all
* implementations, so be careful with large log files.
*
* - Decomposition Block: Works on data sources of _any_ size, but should be
* used on medium+ sized data sources. The complexity is Theta(L*log B),
* equiv Theta(L), since B is a constant. B can be chosen virtually
* arbitrarily, but small B's (block sizes) will induce a certain amount
* of errors. This algorithm only uses memory of the amount Theta(B).
* Note: This algorithm does not return an exact result if B < L (the typical
* case). It can be used on any live data source provided the stream is not
* reset (i.e. no "second run" is possible).
* WARNING: There is a debug switch for evaluation of the errors induced by
* the usage of fixed-size blocks. Turning debugging on will affect the
* memory usage, which will change to Theta(L+B).
*
* - Decomposition: Works on data sources of _any_ size, but should be
* used on medium+ sized data sources. The complexity is Theta(L*log B),
* equiv Theta(L), since B is a constant. B can be chosen virtually
* arbitrarily, but small B's (window sizes) will induce a certain amount
* of errors. The technique used is a moving window and a scheduler.
* This algorithm only uses memory of the amount Theta(B).
* Note: This algorithm does not necessarily return an exact result if B < L
* (the typical case). The error rate should be far lower than that of
* decomposition block assuming the same block size. However, the sorting
* speed is far lower as well.
* It can be used on any live data source provided the stream is not reset
* (i.e. no "second run" is possible).
* WARNING: There is a switch to turn the scheduler off. Doing so might
* affect the error rate. Always leave the scheduler on except for testing.
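*
* A hypothetical usage sketch (the file name and the processing loop are
* illustrative assumptions; next() and read() are inherited from
* <code>MiningInputStream</code>, and setSortingMethod() takes one of the
* METHOD_* constants declared below):
* <pre>
*   LogFileQSort sorter = new LogFileQSort("access.log");
*   sorter.setSortingMethod(LogFileQSort.METHOD_DECOMPBLOCK);
*   while (sorter.next()) {
*       MiningVector mv = sorter.read();
*       // process the (transaction ID, item ID) pair in mv
*   }
* </pre>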
*/
package com.prudsys.pdm.Input.Records.Log;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
public class LogFileQSort extends MiningInputStream {
// constants for sorting algorithms to be used w/ setSortingMethod()
public static final int METHOD_TRIVIAL = 1;
public static final int METHOD_GLOBALBLOCK = 2;
public static final int METHOD_DECOMPBLOCK = 3;
public static final int METHOD_DECOMP = 4;
// the data source delivering unsorted, but preprocessed data
private LogFileQParse processedStream;
private int sortingMethod;
// the data type the item ID string is to be converted to
private int itemIDType = CategoricalAttribute.STRING;
// mining vector returned by a call to the read() function
private MiningVector nextMiningVector;
private long nextTransactionID;
// for read() function - indicates whether there was a call to next() (== update) in between or not
private boolean nextMiningVectorUpdated;
// variables for item index of current (!) session
private int itemIndexCurrent;
private long transactionIDLastSeen;
//statistical variables
// time taken (in ms) for the "pure" sorting, not including parsing
public long sortTimeTaken;
// total time taken (in ms), including parsing
public long totalTimeTaken;
/* *************************************************************** */
/* ************* Variables for algorithm "trivial" *************** */
/* *************************************************************** */
private Hashtable trivialHashtable;
private int trivialCursor;
private String trivialCurrentTransactionID;
private int trivialNumTransactions;
/* *************************************************************** */
/* ********** Variables for algorithm "global block" ************* */
/* *************************************************************** */
// initialising that variable here is sort of a dirty trick -
// a "reset" does not purge the results of previous sorting efforts
private boolean globalBlockInitialised = false;
private int globalBlockValidEntries;
private int globalBlockCursor;
// data structure that is used for sorting
private GlobalBlockBaseClass[] globalBlockSortArray;
private class GlobalBlockBaseClass
{
public long transactionIDNumber;
public double itemIDkey;
}
private class GlobalBlockListClass extends GlobalBlockBaseClass
{
GlobalBlockListClass next = null;
}
// a comparator for the global sort memory structure
private class GlobalBlockComparator implements Comparator
{
public int compare(Object o1, Object o2)
{
long i1 = ((GlobalBlockBaseClass)o1).transactionIDNumber;
long i2 = ((GlobalBlockBaseClass)o2).transactionIDNumber;
if (i1==i2)
return 0;
else
return (i1<i2)?-1:1;
}//end compare()
}//end class GlobalBlockComparator
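// Illustrative helper (an assumption for exposition, not called by the
// original code): the global-block algorithm presumably sorts the filled
// portion of its array with the comparator above, along these lines.
private void globalBlockSortSketch()
{
Arrays.sort(globalBlockSortArray, 0, globalBlockValidEntries,
new GlobalBlockComparator());
}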
/* *************************************************************** */
/* ********** Variables for algorithm "decomp block" ************* */
/* *************************************************************** */
// debugging variables:
/* *********** debugging switch ************************* */
private static final boolean decompBlockDebug = false;
/* *********** debugging switch ************************* */
// global hashtable used for debugging
private Hashtable decompBlockDebugHashtable;
// debugging statistics
protected long decompBlockDebugWrongSingle; // number of wrong (artificial) transactions
protected long decompBlockDebugWrongMulti; // number of wrong (artificial) transactions appearing in multiple blocks
// regular variables:
private boolean decompBlockInitialised;
private boolean decompBlockDataPresent; // block contains valid data
private int decompBlockBlockSize = 100000; // block size - initialised w/ a default, but can be changed w/ setBlockSize()
private int decompBlockEntries; // number of entries in block
private int decompBlockCursor; // read cursor
private int decompBlockMaxTransactionNumber; // a variable ensuring transaction ids stay unique beyond block boundaries
// sorting block
private DecompBlockBaseClass[] decompBlockSortArray;
private class DecompBlockBaseClass
{
public int transactionIDNumber;
public double itemIDkey;
}
// a comparator for the decomp sort memory structure - technically identical to the one used for global block sort
private class DecompBlockComparator implements Comparator
{
public int compare(Object o1, Object o2)
{
int i1 = ((DecompBlockBaseClass)o1).transactionIDNumber;
int i2 = ((DecompBlockBaseClass)o2).transactionIDNumber;
if (i1==i2)
return 0;
else
return (i1<i2)?-1:1;
}//end compare()
}//end class DecompBlockComparator
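// Illustrative helper (an assumption for exposition, not called by the
// original code): each block of up to decompBlockBlockSize entries is
// presumably sorted in isolation before being read out; transactions that
// span a block boundary get split, causing the inexactness noted in the
// class comment for B < L.
private void decompBlockSortSketch()
{
Arrays.sort(decompBlockSortArray, 0, decompBlockEntries,
new DecompBlockComparator());
}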
/* *************************************************************** */
/* **** Variables for algorithm "decomp" (floating window) ******* */
/* *************************************************************** */
private int decompWindowSize = 100000; // window size - initialised w/ a default, but can be changed w/ setBlockSize()
protected boolean decompOptimize = true; // optimisation switch; just leave it as it is
private long decompListEntries;
private Hashtable decompHashtable;
protected long decompTransactionIDCounter;
private DecompListClass decompListFirstElement;
private DecompListClass decompListLastElement;
// list elements
private class DecompListClass
{
public long transactionIDNumber;
public String transactionIDString; // only used as a pointer
public double itemIDkey;
public DecompListClass nextElement;
}
// elements associated with transaction id, saved in hashtable
private class DecompHashClass
{
public long transactionIDNumber;
public long numElements;
public DecompListClass firstElement;
public DecompListClass lastElement;
public String previousTransactionIDString;
}
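// Illustrative helper (an assumption for exposition, not called by the
// original code): shows how the hashtable and list structures are expected
// to cooperate when an item arrives for an already-known transaction.
private void decompAppendSketch(String transactionIDString, DecompListClass newNode)
{
DecompHashClass entry = (DecompHashClass) decompHashtable.get(transactionIDString);
if (entry == null)
return; // first item of a transaction would create a new entry instead
entry.lastElement.nextElement = newNode; // append to the per-transaction sub-list
entry.lastElement = newNode;
entry.numElements++;
decompListEntries++;
}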
/* *************************************************************** */
/**
* Constructor. Creates and initializes preprocessor stream.
*
* @param file name of the input file; the parameter is passed to the preprocessor
* @throws MiningException
*/
public LogFileQSort(String file) throws MiningException
{
super();
processedStream = new LogFileQParse(file);
try
{
processedStream.open();
} catch (MiningException ex)
{
throw new MiningException("Error: Cannot open file " + file + ": " + ex.getMessage());
}
// initialise current session
itemIndexCurrent = 1;
transactionIDLastSeen = -1;
// set default sorting method
setSortingMethod(METHOD_DECOMPBLOCK);
}
/**
* Function providing access to the underlying preprocessor.
*
* @return LogFileQParse the preprocessor (an extended log file stream)
*/
public LogFileQParse getPreprocessor()
{
return processedStream;
}
/**
* Returns type of item ID attribute.
*
* @return type of item ID attribute
*/
public int getItemIDType() {
return itemIDType;
}
/**
* Sets the type of the item ID attribute. By default, this is string, but
* the string can be converted to another data type (if possible).
*
* @param itemIDType new data type to convert the itemID strings to
*/
public void setItemIDType(int itemIDType) {
this.itemIDType = itemIDType;
}