📄 logfilesequentialpreprocess.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 0.9 deprecated
*/
/**
*
* This is an experimental version of a parsing class to be used on
* transaction / item pairs extracted from log files with the parser
* LogFileSequentialPreprocess.
* This class is based on another preprocessor and a log file recogniser class
* (LogFileStream and LogFileRecogniser).
*
* Important note:
* This file is not maintained and the class should NOT be used.
* If the sorting functionality is required, please use the class LogFileQParse.
*
*/
package com.prudsys.pdm.Input.Records.Log;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Vector;
import org.apache.oro.text.perl.Perl5Util;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningVector;
public class LogFileSequentialPreprocess extends LogFileStream {
/** Sentinel field number meaning "not set"; setTransactionIDParameters() autodetects the field while this value is stored. */
public static int DEFAULT_FIELD_NUM = -1;
/** Default item ID patterns; the constructor installs them as item format 0. Presumably used like the transaction patterns (start marker, end marker) - the item extraction code is not part of this excerpt. */
public static String DEFAULT_VEC_EXTRACT_ITEM_ID[] = {"/ProductID=/","/\\&/"};
/**
 * Default transaction ID patterns; the constructor installs them as transaction format 0.
 * In extractTransactionID() element 0 is the pattern the base string is split on
 * (the ID is taken from the field after it) and element 1 is the pattern that ends the ID.
 */
public static String DEFAULT_VEC_EXTRACT_TRANSACTION_ID[] = {"/\\$sid\\$/","/\\?/"};
/** Empty patterns; extractTransactionID() skips the corresponding parsing step when a format entry equals one of these. */
public static String DEFAULT_VEC_EXTRACT_NONE[] = {"",""};
/** Attribute name tried first when autodetecting the transaction ID field. */
public static String FIELD_NAME_URI = "cs-uri";
/** Attribute name tried last when autodetecting the transaction ID field. */
public static String FIELD_NAME_URI_STEM = "cs-uri-stem";
/** Attribute name tried second when autodetecting the transaction ID field. */
public static String FIELD_NAME_URI_QUERY = "cs-uri-query";
// statistical variables; both are zeroed by the constructor and by reset()
// (the code that increments them is not part of this excerpt)
protected int numLinesInvalid;
protected int numLinesCount;
// original variables cannot be overwritten, because they are required for this class
// meta data of the processed stream: transactionId, itemId, itemIndex (built in the constructor)
protected MiningDataSpecification metaDataProcessed;
// NOTE(review): presumably the current vector of the processed stream - not used in this excerpt, confirm
protected MiningVector cursorVectorProcessed;
// position of required fields in the mining vector (for fast access)
// transaction id and product id may be encoded in two separate attributes
// all four are initialised to DEFAULT_FIELD_NUM by the constructor
private int vecNumTransactionID;
private int vecNumTransactionID2;
private int vecNumItemID;
private int vecNumItemID2;
// format tables: one Vector of pattern strings per parsing format (100 slots are allocated in the constructor)
private Vector vecExtractItemIDX[]; // advanced filter strings for more formats
private Vector vecExtractTransactionIDX[]; // advanced filter strings for more formats
// index of the slot written by the next addExtractIDParameters() call; starts at 0
private int vecExtractNumX; // format counter
// shared Perl5 regex helper used for match() and split() during extraction
private Perl5Util perl;
/**
 * Creates the preprocessor for a log file.
 * Delegates to the superclass constructor, clears the statistics, marks all
 * field numbers as "not set", installs the default extraction patterns as
 * format 0 and builds the meta data of the processed stream
 * (transactionId, itemId, itemIndex).
 *
 * @param file String handed to the superclass constructor
 */
public LogFileSequentialPreprocess(String file)
{
    super(file);

    // statistics start at zero
    numLinesCount = 0;
    numLinesInvalid = 0;

    // no source field selected yet; the setXxxParameters() methods fill these in
    vecNumTransactionID = DEFAULT_FIELD_NUM;
    vecNumTransactionID2 = DEFAULT_FIELD_NUM;
    vecNumItemID = DEFAULT_FIELD_NUM;
    vecNumItemID2 = DEFAULT_FIELD_NUM;

    perl = new Perl5Util();

    // slot 0 of the item format table holds the default item patterns
    Vector defaultItemPatterns = new Vector();
    defaultItemPatterns.add(DEFAULT_VEC_EXTRACT_ITEM_ID[0]);
    defaultItemPatterns.add(DEFAULT_VEC_EXTRACT_ITEM_ID[1]);
    vecExtractItemIDX = new Vector[100];
    vecExtractItemIDX[0] = defaultItemPatterns;

    // slot 0 of the transaction format table holds the default transaction patterns
    Vector defaultTransactionPatterns = new Vector();
    defaultTransactionPatterns.add(DEFAULT_VEC_EXTRACT_TRANSACTION_ID[0]);
    defaultTransactionPatterns.add(DEFAULT_VEC_EXTRACT_TRANSACTION_ID[1]);
    vecExtractTransactionIDX = new Vector[100];
    vecExtractTransactionIDX[0] = defaultTransactionPatterns;

    vecExtractNumX = 0;

    // "artificial" mining specification describing the processed stream
    MiningDataSpecification specification = new MiningDataSpecification();
    specification.setRelationName( super.getName() + " (sequencial analysis stream)" );

    // The transaction id categories are left at the attribute's default here.
    // Callers that need unstored categories (i.e. for decomposition algorithms)
    // switch via setTransactionIDUnstoredCategory().
    CategoricalAttribute transactionIdAttribute = new CategoricalAttribute( "transactionId" );
    specification.addMiningAttribute( transactionIdAttribute );

    CategoricalAttribute itemIdAttribute = new CategoricalAttribute( "itemId" );
    specification.addMiningAttribute( itemIdAttribute );

    // integer attribute with lower bound 0
    NumericAttribute itemIndexAttribute = new NumericAttribute("itemIndex");
    itemIndexAttribute.setDataType(NumericAttribute.INTEGER);
    itemIndexAttribute.setLowerBound( 0 );
    specification.addMiningAttribute( itemIndexAttribute );

    metaDataProcessed = specification;
}
/**
 * Resets the stream and the line statistics.
 * Delegates to the superclass reset (documented as placing the cursor before
 * the first data set) and then zeroes both counters.
 *
 * @throws MiningException if the superclass reset fails; the counters are
 *         left unchanged in that case
 */
public void reset() throws MiningException
{
    // rewind first: the counters are only cleared once the superclass succeeded
    super.reset();

    numLinesCount = 0;
    numLinesInvalid = 0;
}
/**
 * Switches the "transactionId" attribute of the processed meta data between
 * stored and unstored categories.
 *
 * @param setting value forwarded to setUnstoredCategories() of the attribute
 */
public void setTransactionIDUnstoredCategory(boolean setting)
{
    // the constructor registers "transactionId" as a CategoricalAttribute
    CategoricalAttribute transactionIdAttribute =
        (CategoricalAttribute) metaDataProcessed.getMiningAttribute("transactionId");
    transactionIdAttribute.setUnstoredCategories(setting);
}
/**
 * Determines which field of the mining vector carries the transaction ID
 * (session ID). This is the "automatic" version: if no field number has been
 * set yet, the meta data is searched for the attributes FIELD_NAME_URI,
 * FIELD_NAME_URI_QUERY and FIELD_NAME_URI_STEM, in that order, and the index
 * of the first one found is stored. A field number that is already set is
 * left untouched.
 *
 * @throws MiningException if autodetection is required and none of the
 *         three attributes exists in the meta data
 */
public void setTransactionIDParameters() throws MiningException
{
    // an explicitly configured field number wins over autodetection
    if (vecNumTransactionID != DEFAULT_FIELD_NUM)
        return;

    // candidate attribute names in order of preference
    String candidateNames[] = { FIELD_NAME_URI, FIELD_NAME_URI_QUERY, FIELD_NAME_URI_STEM };

    for (int candidate = 0; candidate < candidateNames.length; candidate++)
    {
        if (metaData.getMiningAttribute(candidateNames[candidate]) == null)
            continue;

        // remember the index number, so values can be accessed directly
        vecNumTransactionID = metaData.getAttributeIndex(metaData.getMiningAttribute(candidateNames[candidate]));
        return;
    }

    // no useful attribute found ...
    throw new MiningException("Cannot extract transaction ID from meta data.");
}
/**
 * Sets the field numbers of the transaction ID (session ID) in the mining
 * vector explicitly and then runs the autodetecting variant, which only does
 * work when vecNum is DEFAULT_FIELD_NUM.
 *
 * @param vecNum field number of the transaction ID in the mining vector;
 *        pass DEFAULT_FIELD_NUM to request autodetection
 * @param vecNum2 second field number; stored unchanged and later handed to
 *        extractBaseString() together with the first one
 *
 * @throws MiningException if autodetection was requested and failed
 */
public void setTransactionIDParameters(int vecNum, int vecNum2) throws MiningException
{
    this.vecNumTransactionID2 = vecNum2;
    this.vecNumTransactionID = vecNum;

    // resolves the first field number if it was left at DEFAULT_FIELD_NUM
    this.setTransactionIDParameters();
}
/**
 * Adds an additional parsing format consisting of one pattern array for the
 * transaction ID and one for the item ID.
 *
 * Each array must hold 2 to 4 entries. Every non-empty entry is compiled once
 * with Perl5Util to validate it; a malformed pattern surfaces as the runtime
 * exception thrown by Perl5Util.match(). For the transaction array the entries
 * are used by extractTransactionID() as: [0] start pattern, [1] end pattern,
 * [2] optional "kill" filter, [3] optional "match" filter.
 *
 * The format is stored in slot vecExtractNumX, which is then advanced. Since
 * the counter starts at 0, the first call replaces the default patterns the
 * constructor placed in slot 0. Nothing is stored when validation fails.
 *
 * @param vecTransactionExtract patterns for the transaction ID (2-4 entries)
 * @param vecItemExtract patterns for the item ID (2-4 entries)
 * @throws MiningException if an array is null, has fewer than 2 or more than
 *         4 entries, contains a null entry, or if the format table is full
 */
public void addExtractIDParameters(String vecTransactionExtract[], String vecItemExtract[]) throws MiningException
{
    // the format tables have a fixed size; report overflow as a MiningException
    if (vecExtractNumX >= vecExtractTransactionIDX.length || vecExtractNumX >= vecExtractItemIDX.length)
        throw new MiningException ("Cannot add parsing format in function addExtractIDParameters(). Maximum number of formats (" + vecExtractTransactionIDX.length + ") reached.");

    // validate and copy both arrays before touching the tables, so that a
    // rejected call leaves the previously stored formats intact
    Vector transactionPatterns = copyAndValidateExtractParameters(vecTransactionExtract, "vecTransactionExtract[]");
    Vector itemPatterns = copyAndValidateExtractParameters(vecItemExtract, "vecItemExtract[]");

    vecExtractTransactionIDX[vecExtractNumX] = transactionPatterns;
    vecExtractItemIDX[vecExtractNumX] = itemPatterns;
    vecExtractNumX++;
}

/**
 * Checks one pattern array passed to addExtractIDParameters() and copies it
 * into a new Vector.
 *
 * @param patterns pattern array to check
 * @param parameterName name of the parameter, used in error messages
 * @return Vector holding the entries of patterns in the same order
 * @throws MiningException if the array is null, has fewer than 2 or more
 *         than 4 entries, or contains a null entry
 */
private static Vector copyAndValidateExtractParameters(String patterns[], String parameterName) throws MiningException
{
    String messagePrefix = "Invalid parameter " + parameterName + " passed to function addExtractIDParameters(). ";

    if (patterns == null)
        throw new MiningException (messagePrefix + "Array is null.");
    if (patterns.length < 2)
        throw new MiningException (messagePrefix + "Error encountered at array position " + patterns.length + ".");
    if (patterns.length > 4)
        throw new MiningException (messagePrefix + "Too many entries (" + patterns.length + " found, 2-4 are valid).");

    Perl5Util validateExtract = new Perl5Util();
    Vector copy = new Vector(patterns.length);
    for (int i = 0; i < patterns.length; i++)
    {
        if (patterns[i] == null)
            throw new MiningException (messagePrefix + "Error encountered at array position " + i + ".");
        // matching against the empty string forces the pattern to be compiled;
        // the empty string itself means "no pattern" and is not compiled
        if (!patterns[i].equals(""))
            validateExtract.match(patterns[i], "");
        copy.add(patterns[i]);
    }
    return copy;
}
/**
 * Extracts the transaction ID (typically from the uri / uri-query) from the
 * mining vector using the transaction patterns of one parsing format.
 *
 * Steps: the base string is obtained via extractBaseString(); if the start
 * pattern is empty it is returned as is. Otherwise the optional "kill" filter
 * (entry 2) is applied, the string is split on the start pattern and the
 * second field is kept. If the end pattern is empty that field is returned;
 * otherwise it is split on the end pattern, the first field is kept and the
 * optional "match" filter (entry 3) is applied to it.
 *
 * @param miningVector mining vector
 * @param parseParemNum parameter string set number (index into the format table)
 * @return transaction ID
 *
 * @throws MiningException if the "kill" filter matches, the start pattern is
 *         not found, no text remains for the ID after splitting, or the
 *         "match" filter does not match
 */
private String extractTransactionID(MiningVector miningVector, int parseParemNum) throws MiningException
{
    // extract string containing the transaction id from the mining vector
    String sidString = extractBaseString(miningVector, vecNumTransactionID, vecNumTransactionID2);

    Vector format = vecExtractTransactionIDX[parseParemNum];

    // further parsing necessary?
    if (format.elementAt(0).equals(DEFAULT_VEC_EXTRACT_NONE[0]))
        return sidString;
    String startPattern = (String) format.elementAt(0);

    // "kill" filter active?
    if (format.size() > 2 && perl.match((String) format.elementAt(2), sidString))
        throw new MiningException("Note: Transaction ID was filtered.");

    // extract the transaction id from the string
    if (!perl.match(startPattern, sidString))
        throw new MiningException("Warning: Missing transaction id in log file.");

    Vector afterStart = new Vector(3);
    perl.split(afterStart, startPattern, sidString);
    // The split can yield fewer than two fields (presumably when nothing
    // follows the start pattern, as Perl-style split drops trailing empty
    // fields). Report that like any other line without an ID instead of
    // failing with an unchecked ArrayIndexOutOfBoundsException.
    if (afterStart.size() < 2)
        throw new MiningException("Warning: Missing transaction id in log file.");
    sidString = afterStart.elementAt(1).toString();

    String endPattern = (String) format.elementAt(1);
    if (endPattern.equals(DEFAULT_VEC_EXTRACT_NONE[1]))
        return sidString;

    Vector beforeEnd = new Vector(3);
    perl.split(beforeEnd, endPattern, sidString);
    // same guard for the second split: no field means there is no ID text
    if (beforeEnd.isEmpty())
        throw new MiningException("Warning: Missing transaction id in log file.");
    sidString = beforeEnd.elementAt(0).toString();

    // "match" filter active?
    if (format.size() > 3 && !perl.match((String) format.elementAt(3), sidString))
        throw new MiningException("Note: Transaction ID has invalid format.");

    return sidString;
}
/**
* Extracts the field number of the item ID from the mining vector
* This is the "automatic" version, which "autodetects" the item ID
*
* @throws MiningException
*/
public void setItemIDParameters() throws MiningException
{
// if not already initialized
if (vecNumItemID==DEFAULT_FIELD_NUM)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -