📄 logfilesequentialsort.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 0.9 deprecated
*/
/**
*
* This is an experimental version of a sorting algorithms class to be used on
* transaction / item pairs extracted from log files with the parser
* LogFileSequentialPreprocess.
* Please note this file is not maintained and the class should NOT be used;
* if the sorting functionality is required, please use the class LogFileQSort.
*
*
*
* Description:
* An (almost) transparent derivative of a <code>MiningInputStream</code> that
* implements a sorting / grouping function to enable the use of sequential
* analysis of web server log files.
*
* The class provides a stream comprising the elements required for the
* sequential analysis algorithm and can be accessed just like a
* <code>MiningFileStream</code> based class
*
*
* Various sorting / grouping mechanisms have been implemented to allow
* for processing of log files of _any_ size.
*
*
* Definitions:
* L = number of transaction-item data pairs
* T = number of transactions; T << L, but T = Theta(L) is assumed
* B = size of a processing block (constant)
*
* Algorithms:
*
* - Trivial: May only be used for very small log files due to a complexity
* of Theta(L*T), equiv Theta(L^2).
* Because it manages a global hashtable, its memory requirements are Theta(T)
* Note: This algorithm returns an exact result.
*
* - Global Block: May be used for small to medium log files.
* complexity is O(L*log(L)). The whole datasource is loaded into memory
* for processing (requires memory of Theta(L))
* Note: This algorithm returns an exact result. It is suitable for processing
* of certain live data sources, even if the stream is reset
*
* - Decomposition Block: Works on data sources of _any_ size, but should be
* used on medium+ sized data sources. The complexity is Theta(L*log B)
* equiv Theta(L). B is a constant, which can be chosen virtually arbitrarily,
* but small B's (block sizes) will induce a certain amount of errors, however.
* This algorithm only uses memory of the amount Theta(B)
* Note: This algorithm does not return an exact result, if B < L (typical case)
* It can be used on any live data source provided the stream is not reset
* (i.e. no "second run" is possible)
* WARNING: There is a debug switch for evaluation of the errors induced by
* the usage of fixed size blocks. Turning debugging on will affect the memory
* usage, which will change to Theta(L+B)
*/
package com.prudsys.pdm.Input.Records.Log;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
public class LogFileSequentialSort extends MiningInputStream {
// constants for sorting algorithms to be used w/ setSortingMethod()
public static final int METHOD_TRIVIAL = 1;
public static final int METHOD_GLOBALBLOCK = 2;
public static final int METHOD_DECOMPBLOCK = 3;
public static final int METHOD_DECOMP = 4;
// the data source delivering unsorted, but preprocessed data
private LogFileSequentialPreprocess processedStream;
// currently selected algorithm; one of the METHOD_* constants above
private int sortingMethod;
// The data type the item ID string is to be converted to (see setItemIDType())
private int itemIDType = CategoricalAttribute.STRING;
// mining vector returned by a call to the read() function
private MiningVector nextMiningVector;
// transaction id associated with nextMiningVector
private long nextTransactionID;
// for read() function - indicates whether there was a call to next() (== update) in between or not
private boolean nextMiningVectorUpdated;
// variables for item index of current (!) session
private int itemIndexCurrent;
private long transactionIDLastSeen;
//statistical variables
// time taken in ms for the "pure" sorting, not including parsing
public long sortTimeTaken;
// total time taken in ms -- presumably including parsing; confirm against implementation
public long totalTimeTaken;
/* *************************************************************** */
/* ************* Variables for algorithm "trivial" *************** */
/* *************************************************************** */
// global table of transactions (requires memory Theta(T), see class comment)
private Hashtable trivialHashtable;
// read position within the sorted result
private int trivialCursor;
// transaction id currently being emitted
private String trivialCurrentTransactionID;
private int trivialNumTransactions;
/* *************************************************************** */
/* ********** Variables for algorithm "global block" ************* */
/* *************************************************************** */
// initialising that variable here is sort of a dirty trick -
// a "reset" does not purge the results of previous sorting efforts
private boolean globalBlockInitialised = false;
// number of used slots in globalBlockSortArray
private int globalBlockValidEntries;
// read position within globalBlockSortArray
private int globalBlockCursor;
// data structure that is used for sorting
private GlobalBlockBaseClass globalBlockSortArray[];
// one (transaction id, item key) entry of the in-memory global sort array
private class GlobalBlockBaseClass
{
public long transactionIDNumber;
public double itemIDkey;
}
// a comparator for the global sort memory structure
private class GlobalBlockComparator implements Comparator
{
    /**
     * Orders GlobalBlockBaseClass entries ascending by transaction ID number.
     *
     * @param o1 first entry (must be a GlobalBlockBaseClass)
     * @param o2 second entry (must be a GlobalBlockBaseClass)
     * @return negative, zero or positive as o1's id is below, equal to or
     *         above o2's id
     */
    public int compare(Object o1, Object o2)
    {
        long lhs = ((GlobalBlockBaseClass) o1).transactionIDNumber;
        long rhs = ((GlobalBlockBaseClass) o2).transactionIDNumber;
        if (lhs < rhs)
            return -1;
        if (lhs > rhs)
            return 1;
        return 0;
    }// end compare()
}// end class GlobalBlockComparator
/* *************************************************************** */
/* ********** Variables for algorithm "decomp block" ************* */
/* *************************************************************** */
// debugging variables:
/* *********** debugging switch ************************* */
private static final boolean decompBlockDebug = false;
/* *********** debugging switch ************************* */
// global hashtable used for debugging only; when enabled, memory usage
// grows to Theta(L+B) as warned in the class comment
private Hashtable decompBlockDebugHashtable;
// debugging statistics
protected long decompBlockDebugWrongSingle; // number of wrong (artificial) transactions
protected long decompBlockDebugWrongMulti; // number of wrong (artificial) transactions appearing in multiple blocks
// regular variables:
private boolean decompBlockInitialised;
private boolean decompBlockDataPresent; // block contains valid data
private int decompBlockBlockSize = 100000; // block size - initialised w/ a default, but can be changed w/ setBlockSize()
private int decompBlockEntries; // number of entries in block
private int decompBlockCursor; // read cursor
private int decompBlockMaxTransactionNumber; // a variable ensuring transaction ids stay unique beyond block boundaries
// sorting block (memory Theta(B), see class comment)
private DecompBlockBaseClass decompBlockSortArray[];
// one (block-local transaction id, item key) entry of the decomp sort block
private class DecompBlockBaseClass
{
public int transactionIDNumber;
public double itemIDkey;
}
// a comparator for the decomp sort memory structure - technically identical to the one used for global block sort
private class DecompBlockComparator implements Comparator
{
    /**
     * Orders DecompBlockBaseClass entries ascending by (block-local)
     * transaction ID number.
     *
     * @param o1 first entry (must be a DecompBlockBaseClass)
     * @param o2 second entry (must be a DecompBlockBaseClass)
     * @return negative, zero or positive as o1's id is below, equal to or
     *         above o2's id
     */
    public int compare(Object o1, Object o2)
    {
        // Unpack into named locals once: the former nested conditional
        // expression repeated the casts four times and was hard to read.
        // Note: an explicit three-way compare (not i1 - i2) avoids overflow.
        int i1 = ((DecompBlockBaseClass) o1).transactionIDNumber;
        int i2 = ((DecompBlockBaseClass) o2).transactionIDNumber;
        if (i1 == i2)
            return 0;
        return (i1 < i2) ? -1 : 1;
    }// end compare()
}// end class DecompBlockComparator
/* *************************************************************** */
/* **** Variables for algorithm "decomp" (floating window) ******* */
/* *************************************************************** */
private int decompWindowSize = 100000; // window size - initialised w/ a default, but can be changed w/ setBlockSize()
protected boolean decompOptimize = true; // optimisation switch; just leave it as it is
// current number of elements in the linked list below
private long decompListEntries;
// transaction id (String) -> DecompHashClass bookkeeping entry -- presumed key type; confirm against implementation
private Hashtable decompHashtable;
// running counter used to assign numeric transaction ids
private long decompTransactionIDCounter;
// head and tail of the floating-window linked list of DecompListClass nodes
private DecompListClass decompListFirstElement;
private DecompListClass decompListLastElement;
// a single node of the floating-window linked list
private class DecompListClass
{
public long transactionIDNumber;
public String transactionIDString; // only used as a pointer
public double itemIDkey;
public DecompListClass nextElement; // null marks the end of the list -- assumed; confirm against implementation
}
// per-transaction bookkeeping record, saved in decompHashtable
private class DecompHashClass
{
public long transactionIDNumber;
// first and last list nodes belonging to this transaction
public DecompListClass firstElement;
public DecompListClass lastElement;
public String previousTransactionIDString;
}
/* *************************************************************** */
/**
 * Constructor. Creates and initializes the preprocessor stream.
 *
 * Opens the underlying {@link LogFileSequentialPreprocess} on the given
 * file and tries to apply its default transaction-id / item-id parameters.
 * If that fails, the stream is still usable but the parameters must be set
 * manually through the preprocessor (see getPreprocessor()).
 * The default sorting method is METHOD_DECOMPBLOCK.
 *
 * @param file (File) name of input stream. Parameter is passed to the preprocessor.
 * @throws MiningException if recognizing or resetting the preprocessor stream fails
 */
public LogFileSequentialSort(String file) throws MiningException
{
super();
processedStream = new LogFileSequentialPreprocess(file);
// initialize original input file stream
// NOTE(review): call order (recognize -> reset -> set parameters -> open)
// appears to be required by the preprocessor contract - confirm before changing
processedStream.recognize();
processedStream.reset();
try
{
processedStream.setTransactionIDParameters();
processedStream.setItemIDParameters();
processedStream.open();
} catch (MiningException ex)
{
// deliberate best-effort: default parameter setup may fail for unusual
// log formats; the caller is expected to configure and open manually
System.out.println("Warning: Setting of default parameters failed. Manual setting required.");
}
// initialise current session
itemIndexCurrent = 1;
transactionIDLastSeen = -1;
// set default sorting method
setSortingMethod(METHOD_DECOMPBLOCK);
}
/**
 * Gives access to the underlying preprocessor stream created by the
 * constructor, e.g. for manual parameter configuration.
 *
 * @return LogFileSequentialPreprocess (extended LogFileStream)
 */
public LogFileSequentialPreprocess getPreprocessor()
{
    return this.processedStream;
}
/**
 * Returns the type of the item ID attribute.
 *
 * @return type of item ID attribute (a CategoricalAttribute type constant;
 *         CategoricalAttribute.STRING by default)
 */
public int getItemIDType() {
    return this.itemIDType;
}
/**
* Sets type pf item ID attribute. By default, this is string but the
* string can be converted to another data type (if possible)
*
* @param itemIDType new data type to convert the itemID strings
*/
public void setItemIDType(int itemIDType) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -