📄 logfileqsort.java
字号:
this.itemIDType = itemIDType;
}
/**
 * Changes the default block size used by both the decomposition-block
 * algorithm and the decomposition algorithm, then re-initialises the
 * currently selected sorting method so the new size takes effect.
 *
 * Note: Small blocks may induce significant errors, while
 * very large blocks may reduce performance (Theta(B*log B)).
 * Also, the blocks permanently occupy a respective amount of memory.
 *
 * @param newBlockSize new block size in #(t-id,item-id) pairs; must be positive
 * @throws MiningException if newBlockSize is not positive, or if
 *         re-initialising the current sorting method fails
 */
public void setBlockSize(int newBlockSize) throws MiningException
{
    // Reject non-positive sizes up front: a value <= 0 would later surface as a
    // NegativeArraySizeException (or a useless zero-capacity buffer/hashtable)
    // when setSortingMethod() allocates the sort structures.
    if (newBlockSize <= 0)
        throw new MiningException("setBlockSize(): block size must be positive (newBlockSize = " + newBlockSize + ").");
    // here we don't care about the sorting method used,
    // blocksize is a uniformly valid parameter for all decomp-type methods
    decompBlockBlockSize = newBlockSize;
    decompWindowSize = newBlockSize;
    setSortingMethod(sortingMethod);
}
/**
 * Selects the sorting method and (re-)initialises all state belonging to it.
 * <BR>See general description of <code>LogFileSequentialSort</code> class
 *
 * @param newSortingMethod one of the METHOD_* constants
 * @throws MiningException if newSortingMethod is not a known method constant
 */
public void setSortingMethod(int newSortingMethod) throws MiningException
{
    sortingMethod = newSortingMethod;
    // timers start from scratch whenever a method is (re-)selected
    sortTimeTaken = 0;
    totalTimeTaken = 0;
    if (sortingMethod == METHOD_TRIVIAL)
    {
        trivialHashtable = new Hashtable();
        trivialCursor = 1;
        trivialNumTransactions = 0;
        trivialCurrentTransactionID = "";
        // sorting and parsing are heavily interwoven for this method,
        // so the pure sort time cannot be measured accurately
        sortTimeTaken = -1;
        return;
    }
    if (sortingMethod == METHOD_GLOBALBLOCK)
    {
        globalBlockCursor = 0;
        return;
    }
    if (sortingMethod == METHOD_DECOMPBLOCK)
    {
        decompBlockInitialised = false;
        decompBlockDataPresent = false;
        decompBlockCursor = 0;
        decompBlockMaxTransactionNumber = 0;
        // pre-allocate the sort buffer (one entry per slot)
        decompBlockSortArray = new DecompBlockBaseClass[decompBlockBlockSize];
        for (int slot = 0; slot < decompBlockBlockSize; slot++)
            decompBlockSortArray[slot] = new DecompBlockBaseClass();
        // with debugging enabled, set up the global hashtable
        // and clear the error counters
        if (decompBlockDebug)
        {
            decompBlockDebugHashtable = new Hashtable();
            decompBlockDebugWrongSingle = 0;
            decompBlockDebugWrongMulti = 0;
        }
        return;
    }
    if (sortingMethod == METHOD_DECOMP)
    {
        decompListEntries = 0;
        decompHashtable = new Hashtable(decompWindowSize);
        decompTransactionIDCounter = 1;
        // the global linked list starts out empty
        decompListFirstElement = null;
        decompListLastElement = null;
        return;
    }
    throw new MiningException("setSortingMethod(): Unknown sorting method (newSortingMethod = " + newSortingMethod + ").");
}
/**
 * Overrides the original getMetaData, because the meta data of the
 * processed stream differs from that of the original data source.<BR>
 * To retrieve the original meta data, call getPreprocessor().getMetaData()
 *
 * @throws MiningException
 * @return MiningDataSpecification meta data of the processed stream
 */
public MiningDataSpecification getMetaData() throws MiningException
{
    MiningDataSpecification processedMetaData = processedStream.metaDataProcessed;
    return processedMetaData;
}
/**
 * Returns a mining vector that is "in order" with respect to transaction id,
 * and stamps it with a per-transaction item index (position 2).
 *
 * The labeling work is done at most once per call to next(): as long as
 * nextMiningVectorUpdated is false, the already-labeled cached vector is
 * returned unchanged.
 *
 * @throws MiningException
 * @return MiningVector the (labeled) current mining vector
 */
public MiningVector read() throws MiningException
{
    // already labeled since the last next()? hand out the cached vector
    if (!nextMiningVectorUpdated)
        return nextMiningVector;
    nextMiningVectorUpdated = false;
    // same transaction as last time: just advance the item index;
    // a new transaction restarts the index at 1
    if (transactionIDLastSeen == nextTransactionID)
    {
        itemIndexCurrent++;
    }
    else
    {
        itemIndexCurrent = 1;
        transactionIDLastSeen = nextTransactionID;
    }
    // write the session-relative item number into the vector
    nextMiningVector.setValue(2, itemIndexCurrent);
    // convert the item ID to the configured target type
    convertItemId();
    return nextMiningVector;
}
/**
 * Converts the item ID string (category at position 1) to the configured
 * item ID type. If the string cannot be parsed, the type's zero/false
 * default is used instead. The (possibly new) category key is written
 * back into position 1 of the current mining vector.
 *
 * @throws MiningException if the configured item ID type is not supported
 */
private void convertItemId() throws MiningException {
    if (itemIDType == CategoricalAttribute.STRING)
        return; // strings need no conversion
    double key = nextMiningVector.getValue(1);
    CategoricalAttribute catAtt = (CategoricalAttribute)
        nextMiningVector.getMetaData().getMiningAttribute(1);
    Category cat = catAtt.getCategory(key);
    // Default value of the target type (0.0 / 0.0f / 0 / false).
    // "0" parses to the correct default for all four supported types,
    // and this call also rejects unsupported itemIDType values early.
    Object catOb = toItemIDObject("0");
    try {
        // best effort: replace the default with the parsed category value
        catOb = toItemIDObject((String) cat.getValue());
    }
    catch (Exception ignored) {
        // parse/cast failure: keep the default, as documented above
    }
    Category catNew = new Category(catOb);
    key = catAtt.getKey(catNew);
    if (Category.isMissingValue(key))
        key = catAtt.addCategory(catNew);
    nextMiningVector.setValue(1, key);
}

/**
 * Parses a string into an object of the configured item ID type.
 *
 * @param value string representation to parse
 * @return boxed value of the configured item ID type
 * @throws MiningException if itemIDType is not one of the supported types
 */
private Object toItemIDObject(String value) throws MiningException {
    if (itemIDType == CategoricalAttribute.DOUBLE)
        return Double.valueOf(value);
    if (itemIDType == CategoricalAttribute.FLOAT)
        return Float.valueOf(value);
    if (itemIDType == CategoricalAttribute.INTEGER)
        return Integer.valueOf(value);
    if (itemIDType == CategoricalAttribute.BOOLEAN)
        return Boolean.valueOf(value);
    throw new MiningException("itemID type " + itemIDType + " not supported");
}
/**
 * Advances the stream by <code>position</code> records. Implemented as a
 * loop of next() calls, so there is no real performance gain over calling
 * next() repeatedly yourself.
 *
 * @param position relative amount of data sets to advance; must be &gt; 0
 * @throws MiningException if position is not positive
 * @return boolean result of the last call to next()
 */
public boolean move( int position ) throws MiningException
{
    // moving backwards (or not at all) is not supported
    if (position <= 0)
        throw new MiningException("Cannot move backwards or advance zero lines in file stream.");
    boolean lastResult = false;
    int remaining = position;
    // advance one record at a time via next()
    while (remaining-- > 0)
        lastResult = next();
    return lastResult;
}
/**
 * next() implementation based on a trivial "sorting" algorithm.<BR>
 * A hashtable is used to identify individual transaction ids - for
 * every "new" (not previously processed) transaction id found, all
 * entries containing that t-id are extracted from the data stream.
 *
 * State used (hedged where SOURCE does not show the declaration):
 * <ul>
 * <li>trivialCurrentTransactionID - t-id currently being emitted; "" means
 *     "scanning for the next unseen t-id"</li>
 * <li>trivialHashtable - set of t-ids already fully emitted (or in progress)</li>
 * <li>trivialCursor - stream position to rescan from; advanced only while
 *     scanning for a new t-id (presumably so already-scanned records are
 *     skipped on the next pass - TODO confirm against processedStream.move)</li>
 * <li>trivialNumTransactions / nextTransactionID - running transaction counters</li>
 * </ul>
 * Note: this approach re-reads the stream once per distinct transaction,
 * which is the deliberate trade-off of the "trivial" method.
 *
 * @throws MiningException
 * @return boolean true if nextMiningVector holds a new record, false at end of data
 */
private boolean trivialNext() throws MiningException
{
boolean haveNextVector;
MiningVector miningVector;
String transactionString;
// search loop, left through return
while(true)
{
haveNextVector = processedStream.next();
if (haveNextVector)
{
// get next mining vector and retrieve transaction id string
// (the t-id is the first element of the vector)
miningVector = processedStream.read();
transactionString = miningVector.toVector().elementAt(0).toString();
// are we working on a certain transaction so far?
// "" means no: we are scanning for the next unseen transaction id
if (trivialCurrentTransactionID.equals(""))
{
// remember how far the scan got, so the EOF branch below
// can resume from here instead of from the stream start
trivialCursor++;
// check if the current transaction is already in hash table
if (trivialHashtable.containsKey(transactionString))
{
// we know that key, so we just continue
continue;
} else
{
// found a new unique transaction
// increase transaction counter
trivialNumTransactions++;
// add transaction id to hashtable
trivialHashtable.put(transactionString, new Integer(trivialNumTransactions));
// set new "current" transaction id
trivialCurrentTransactionID = transactionString;
nextTransactionID++;
nextMiningVector = miningVector;
return true;
}
}
// current mining vector still belongs to the same session? simplest scenario - just output
if (trivialCurrentTransactionID.equals(transactionString))
{
nextMiningVector = miningVector;
return true;
}
// we stumbled over a transaction id which is different from what we are looking for
// (it belongs to some other transaction; it will be picked up on a later rescan)
continue;
}
else // haveNextVector failed
{
// if there is no current transaction and no vector to read, we're done
if (trivialCurrentTransactionID.equals(""))
{
return false;
}
// end of file reached, but there may be work left, so go back to cursor and have a look
// delete current session
trivialCurrentTransactionID = "";
// reset data source
processedStream.reset();
// we move to trivialCursor -1, because the following call of next() also advances "cursor"
processedStream.move(trivialCursor-1);
continue;
} // haveNextVector
} // while(true)
}
/**
* next() implementation based on an algorithm that uses a single global block.<BR>
* A memory structure comprising all the relevant data of the data source is
* created und a merge sort is used for sorting purposes.<BR>
* The sorting speed is very good even for very large files,
* but the memory requirement is likely to cause trouble sooner or later (Theta(L))
*
* @throws MiningException
* @return boolean
*/
private boolean globalBlockNext() throws MiningException
{
// File needs to be read into memory
// initialisiation, including all the sorting is done in advance and only once
if (!globalBlockInitialised)
{
globalBlockInitialised = true;
GlobalBlockListClass globalBlockSortList = new GlobalBlockListClass();
GlobalBlockListClass anchor = globalBlockSortList;
// number of valid entries read (required to get buffer)
globalBlockValidEntries = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -