LogFileQParse.java
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 0.9
*/
/**
* An (almost) transparent derivative of a <code>MiningInputStream</code> that
* implements a very fast, programmable parser to extract transaction/item pairs.
*
* The class provides a stream whose elements can be sorted (using
* LogFileQSort) and used with the sequential analysis algorithm, and it can
* be accessed just like a <code>MiningFileStream</code>-based class.
*
* Note: This class is not backward compatible with the previous parsing
* class and cannot be used with the original LogFileSequentialSort class;
* it must be used in conjunction with LogFileQSort. Please do NOT use the
* previous parsing class LogFileSequentialClass.
*
* Parsing strategy: The data source is read line by line using the standard
* buffered line reader. For each line read, a parsing tree structure is
* traversed until both transaction and item id have been extracted or the
* parsing has failed. In the latter case the next line is read and parsed.
*
* The parsing tree is a binary tree with an action associated with every
* node. If the node's action succeeds, the algorithm continues with the
* left ("next") child node, otherwise with the right ("alternative") child
* node, so it is effectively a recursive depth-first search. When a leaf
* node is reached ("next" == null), the parsing action was successful and
* either the transaction id or the item id has been extracted. When both
* transaction and item id are extracted, the overall parsing process was
* successful and the result is returned. If the algorithm runs out of
* alternatives ("alternative" == null), the parsing of the line has failed.
* (An illustrative traversal sketch follows the LogFileQParseAction class
* below.)
*/
package com.prudsys.pdm.Input.Records.Log;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Vector;
import org.apache.oro.text.perl.Perl5Util;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
public class LogFileQParse extends MiningInputStream {
// statistical variables
protected int numLinesInvalid;
protected int numLinesCount;
// the superclass variables must not be overwritten because they are still needed, so this class keeps processed counterparts
protected MiningDataSpecification metaDataProcessed;
protected MiningVector cursorVectorProcessed;
// mining attributes of interest; cached in fields to avoid repeated lookups
private MiningAttribute attributeTransactionID;
private boolean haveTransaction;
private MiningAttribute attributeItemID;
private boolean haveItem;
// desired components parsed from the data
private String qParseItemString;
private String qParseTransactionString;
// File reader objects
private String logFileName;
private BufferedReader bufferedReader;
private boolean logFileIsOpen = false;
// Perl regular-expression processor
private Perl5Util perl;
private static final LogFileQParseAction ACTION_DONE = null;
public static final int PARSE_PERL_MATCH = 1;
public static final int PARSE_PERL_MATCHEXTRACT = 2;
public static final int PARSE_PERL_NEGMATCH = 4;
public static final int PARSE_EXTRACT = 8;
public static final int PARSE_PREFIX = 16;
// base class for parsing tree
public class LogFileQParseAction
{
int actionType;                        // one of the PARSE_* constants
boolean isTransaction;                 // true if this action extracts the transaction id
boolean isItem;                        // true if this action extracts the item id
LogFileQParseAction nextAction;        // followed on success; null marks a leaf
LogFileQParseAction alternativeAction; // followed on failure; null means parsing failed
}
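// Illustrative traversal sketch (hypothetical; the actual tree walker is not
// part of this excerpt). Following the strategy described in the class
// comment, a recursive depth-first walk over these nodes could look like
// this, where apply() stands for a hypothetical dispatch on actionType:
//
//   String walk(LogFileQParseAction node, String line) {
//       if (node == ACTION_DONE)                // ran out of alternatives:
//           return null;                        // parsing of this line failed
//       String result = apply(node, line);      // run this node's action
//       if (result == null)                     // action failed:
//           return walk(node.alternativeAction, line);
//       if (node.nextAction == ACTION_DONE)     // leaf: id successfully extracted
//           return result;
//       return walk(node.nextAction, result);   // refine the partial result
//   }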
// special class for parsing using perl utils
private class LogFileQParseActionPerl extends LogFileQParseAction
{
String configString; // presumably the Perl5 pattern used by the match actions
}
// special class for parsing using simple character search
private class LogFileQParseActionExtract extends LogFileQParseAction
{
char separator1;
int count1;
char separator2;
int count2;
}
// special class for parsing using a prefix/postfix separator character
private class LogFileQParseActionPrefixPostfix extends LogFileQParseAction
{
char separator;
}
// this is the very important anchor to the parse-tree
private LogFileQParseAction parseAnchor;
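// Illustrative wiring sketch (hypothetical values, not from the original
// source): a tiny tree that extracts the transaction id via a Perl match
// and falls back to a character-based extraction when the match fails
// could be assembled like this:
//
//   LogFileQParseActionPerl txNode = new LogFileQParseActionPerl();
//   txNode.actionType    = PARSE_PERL_MATCHEXTRACT;
//   txNode.isTransaction = true;
//   txNode.configString  = "/session=(\\w+)/";      // assumed log format
//
//   LogFileQParseActionExtract fallback = new LogFileQParseActionExtract();
//   fallback.actionType    = PARSE_EXTRACT;
//   fallback.isTransaction = true;
//   fallback.separator1 = ';'; fallback.count1 = 1; // between the 1st ';'
//   fallback.separator2 = ';'; fallback.count2 = 1; // and the next ';'
//
//   txNode.alternativeAction = fallback;  // tried when the Perl match fails
//   parseAnchor = txNode;                 // both nodes are leaves ("next" == null)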
/**
* Constructor calls constructor of superclass and initialises variables
* Main task is the creation of meta data (fixed format is used internally)
*
* @param fileName String
* @throws MiningException
*/
public LogFileQParse(String fileName) throws MiningException
{
super();
numLinesInvalid = 0;
numLinesCount = 0;
logFileName = fileName;
// generate "artificial" mining specification
metaDataProcessed = new MiningDataSpecification();
metaDataProcessed.setRelationName( super.getName() + " (sequential analysis stream)" );
CategoricalAttribute attribute1 = new CategoricalAttribute( "transactionId" );
metaDataProcessed.addMiningAttribute( attribute1 );
attributeTransactionID = metaDataProcessed.getMiningAttribute("transactionId");
CategoricalAttribute attribute2 = new CategoricalAttribute( "itemId" );
metaDataProcessed.addMiningAttribute( attribute2 );
attributeItemID = metaDataProcessed.getMiningAttribute("itemId");
NumericAttribute attribute3 = new NumericAttribute("itemIndex");
attribute3.setDataType(NumericAttribute.INTEGER);
attribute3.setLowerBound( 0 );
metaDataProcessed.addMiningAttribute( attribute3 );
// no parsing directives so far
parseAnchor = null;
perl = new Perl5Util();
}
/**
* Sets cursor position before first data set
* Executes reset of superclass and resets statistics
*
* @throws MiningException operation failed
*/
public void reset() throws MiningException
{
try
{
if (logFileIsOpen)
close();
open();
}
catch( MiningException ex )
{
throw new MiningException( "qParse: reset() failed with message " + ex.getMessage() );
}
}
/**
* Opens the mining file stream. If the stream is already open, it is reset instead.
*
* @exception MiningException if a mining source access error occurs
*/
public void open() throws MiningException
{
if (logFileIsOpen)
{
reset();
return;
}
try
{
bufferedReader = new BufferedReader(new FileReader(logFileName),
1024 * 1024);
} catch (Exception e)
{
throw new MiningException("Can't open file " + logFileName + ".");
}
logFileIsOpen = true;
numLinesInvalid = 0;
numLinesCount = 0;
}
/**
* Close mining file stream by closing reader.
*
* @exception MiningException if a mining source access error occurs
*/
public void close() throws MiningException
{
try
{
bufferedReader.close();
logFileIsOpen = false;
}
catch( IOException ex)
{
throw new MiningException( "Can't close reader.");
}
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
* Returns supported stream methods (only special function supported is reset()).
*
* @return supported stream methods
*/
public Enumeration getSupportedStreamMethods()
{
Vector suppmeth = new Vector();
suppmeth.addElement("reset");
return suppmeth.elements();
}
/**
* Returns the CWM mapping from the physical to the logical data model.
*
* @return transformation of physical to logical data model
* @throws MiningException couldn't get transformation
*/
public org.omg.cwm.analysis.transformation.TransformationMap getPhysicalToLogicalModelTransformation()
throws MiningException
{
throw new MiningException("This class does not support getPhysicalToLogicalModelTransformation()");
}
/**
* Finds physical file model (CWM Resource Package "Record").
*
* @exception MiningException couldn't obtain physical model
*/
public void findPhysicalModel() throws MiningException
{
throw new MiningException("This class does not support findPhysicalModel()");
}
/**
* Recognize the input stream data specification.
*
* @return the MiningDataSpecification
* @exception MiningException always thrown
*/
public MiningDataSpecification recognize() throws MiningException
{
throw new MiningException( "This class does not support recognize()" );
}
/**
* Changes the setting of the transaction id attribute (stored vs. unstored
* categories); unstored categories make sense for decomposition algorithms.
*
* @param setting boolean
*/
public void setTransactionIDUnstoredCategory(boolean setting)
{
((CategoricalAttribute)metaDataProcessed.getMiningAttribute("transactionId")).setUnstoredCategories(setting);
}
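// Usage note (illustrative): call this before reading the stream, e.g.
// setTransactionIDUnstoredCategory(true) when feeding a decomposition algorithm.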
/**
* Creates a stripped-down version of the input stream that comprises only
* the elements required for sequential analysis.
*
* @param path Path for the new file
* @param doWrite If true, the extracted information is written to the specified file; turn this off for testing on systems without the required write permissions.
* @throws MiningException cannot create file
*/
public void dump( String path, boolean doWrite ) throws MiningException
{
open();
// reset counters for statistics
numLinesInvalid = 0;
numLinesCount = 0;
try
{
FileWriter writer = new FileWriter( path );
BufferedWriter buffer = new BufferedWriter( writer, 524288 );
// analogous to "metaData.createArffDescription()":
// create the header for the arff file
String description = "@relation 'SequentialAnalysisFile'\n";
description = description + "@attribute transactionId string\n@attribute itemId string\n@attribute itemIndex real\n";
buffer.write( description + "\n" );
buffer.write( "@data" + "\n" );
MiningVector miningVector;
while( this.next() )
{
miningVector = this.read();
if (doWrite)
buffer.write( miningVector.toString() + "\n" );
}
buffer.close();
}
catch (IOException ex)
{
throw new MiningException( ex.getMessage() );
}
System.out.println("Lines total: " + numLinesCount + ", invalid: " + numLinesInvalid + ".");
}
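// Usage sketch (illustrative; assumes a parse tree has already been wired up
// via parseAnchor, which is not shown in this excerpt):
//
//   LogFileQParse stream = new LogFileQParse("access.log");  // assumed file name
//   stream.dump("sequences.arff", true);  // write the ARFF dump and print stats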
private String qParseExecutePerlMatch(String toParse, String regEx)
{
return perl.match(regEx, toParse) ? toParse : null;
}
private String qParseExecutePerlMatchExtract(String toParse, String regEx)
{
if (perl.match(regEx, toParse))
return perl.getMatch().toString();
else
return null;
}
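// Illustrative example (assumed input, not from the original source): with
// regEx "/item=(\\w+)/" and toParse "GET /cart?item=A17", the match succeeds
// and qParseExecutePerlMatchExtract() returns the full match "item=A17";
// perl.getMatch().group(1) would return just the captured id "A17".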
private String qParseExecuteExtract(String toParse, int count1, char sepchar1, int count2, char sepchar2)
{
// quick-and-dirty implementation; to be replaced with something more
// robust once the framework is complete
int len = toParse.length();
char[] charArray = toParse.toCharArray();
// locate the count1-th occurrence of sepchar1
int c1 = 0; int m = 0;
while (c1 < len)
{
if (charArray[c1]==sepchar1)
m++;
if (m==count1)
break;
c1++;
}
if (c1 >= len)
return null;
// locate the count2-th occurrence of sepchar2 after position c1
int c2 = c1+1; m = 0;
while (c2 < len)
{
if (charArray[c2]==sepchar2)
m++;
if (m==count2)
break;
c2++;
}
if (c2 >= len)
return null;
// return the text strictly between the two separator positions
return toParse.substring(c1+1,c2);
}
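// Worked example: qParseExecuteExtract("a,b,c;d", 1, ',', 1, ';') stops at
// the first ',' (index 1), then at the first subsequent ';' (index 5), and
// returns the text in between: "b,c".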