📄 logfilerecognizer.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2004 Prudential Systems AG
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (valentine.stepanenko@zsoft.ru)
* @author Michael Krautmacher (michael@krautmacher.com)
* @version 1.2
*/
package com.prudsys.pdm.Input.Records.Log;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import org.apache.oro.text.MatchAction;
import org.apache.oro.text.MatchActionInfo;
import org.apache.oro.text.MatchActionProcessor;
import org.apache.oro.text.perl.Perl5Util;
import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.NumericAttribute;
/**
* Main class to recognize log files. There are typically two types of log files:
* self-explaining like Extended or Shop logfile formats and non self-explaining
* like the NCSA or IIS logfile formats. The last ones require a format
* description. <p>
*
* Such description is given in the private 'format' variable in terms
* of regular expressions. The 'format' variable can be easily extended
* for other logfile types or loaded from a separate 'meta-logfile'.
*/
public class LogFileRecognizer extends File
{
/**
* Serialization version identifier (the superclass java.io.File is Serializable).
*/
private static final long serialVersionUID = -2734749069172543241L;
// Log file type codes.
public static final int ZSoft_Meta_Log_File_Format = -1;
public static final int Extended_Log_File_Format = 0;
public static final int IIS_Log_File_Format = 1;
public static final int Intershop_Log_File_Format = 2;
public static final int NCSA_Combined_Log_File_Format = 3;
public static final int NCSA_Common_Log_File_Format = 4;
public static final int Shop_Log_File_Format = 5;
// When checkLogFileDirectives() yields this value, recognize() falls back to
// matching the regular expressions stored in 'format'.
public static final int NO_DIRECTIVES_LOG_FILE_FORMAT = 100;
// Initial value of logFileType; recognize() also assigns it when no entry of 'format' matches.
public static final int UNRECOGNIZED = 101;
// Field / directive names.
// NOTE(review): presumably looked up in the log file header to tell the
// self-explaining formats apart - confirm against checkLogFileDirectives().
public static final String ZSOFT_META_LOG_FILE_FORMAT_FIELDNAME = "pattern";
public static final String SHOP_LOG_FILE_FORMAT_FIELDNAME = "cs-customer-id";
public static final String DIRECTIVE_FIELDS_EXTENDED_LOG_FILE_FORMAT = "Fields";
/**
* Description of logfile formats that are not self-explaining.
* Each row holds, in this order (as read by recognize()):
* [0] human-readable description,
* [1] regular expression passed to checkSpecificPatternsInLogFile() to detect the format,
* [2] separator expression stored as the log file separator,
* [3] the field names of the format, separated by single spaces.
*/
private String format[][] =
{
{ "Microsoft IIS Log File Format", "([\\w?.:/_+-]+, ){14}", "/, /", "c-ip cs-username date time s-sitename s-computername s-ip time-taken cs-bytes sc-bytes sc-status sc-win32-status cs-method cs-uri-stem"},
{ "NCSA Combined Log File Format", "\"GET [\\w?.:/_+-]+ HTTP[\\w?.:/_+-]+\" [\\w?.:/_+-]+ [\\w?.:/_+-]+ \"[\\w?.:/_+-]+\" ", "/\" \"|] \"|] | \\[|\" | \"| /", "c-ip cs-logname cs-username date time cs-method cs-uri-stem cs-version sc-status sc-bytes cs-referer cs-user-agent"},
{ "NCSA Common Log File Format", "\"GET [\\w?.:/_+-]+ HTTP[\\w?.:/_+-]+\"", "/ \\[|] \"|\" | /", "c-ip cs-logname cs-username date time cs-method cs-uri-stem cs-version sc-status sc-bytes" },
{ "Intershop Log File Format", "([\\w?.:/_+-]+\\|){3}", "/\\x7C/", "time c-ip cs-username s-computername s-port time-taken cs-uri cs-user-agent cs-cookie cs-referer cs-sid"}
};
private int logFileType = UNRECOGNIZED; // Log file type; assigned by recognize()
private String description = "Unrecognized"; // Log file type description; returned by getDescription()
private int bytesToRecognize = 5120; // How many bytes required to recognize log file; see setBytesToRecognize()
// NOTE(review): not referenced in the visible part of this file - purpose to be confirmed.
private String regularExpr;
// Compiled from SHOP_LOG_FILE_FORMAT_FIELDNAME in the constructor.
@SuppressWarnings("unused")
private Pattern pattern;
// Separator expression of the recognized format; returned by getLogFileSeparator().
private String logFileSeparator;
// Perl5Compiler instance created in the constructor.
private PatternCompiler compiler;
// Perl5Matcher instance created in the constructor.
@SuppressWarnings("unused")
private PatternMatcher matcher;
// Perl5Util instance created in the constructor.
private Perl5Util perl;
private Hashtable logFileDirectives; //all w3c directives extracted from log file header
// NOTE(review): not referenced in the visible part of this file - purpose to be confirmed.
private int logFileCounter;
private int logFileUnrecognizedFields = 15; // log file fields for unrecognized file; see setLogFileUnrecognizedFields()
// Field names of the recognized format; assigned by recognize().
private String fieldsNames;
/**
* Default constructor.
* <p>
* @param pathname LogFile name.
* @throws MalformedPatternException Thrown if there are problems while recognizing a log file.
* @throws IOException Thrown if there are problems while reading from the log file.
*/
public LogFileRecognizer( String pathname ) throws IOException, MalformedPatternException
{
    // java.io.File(String) throws NullPointerException for a null pathname, so the
    // null check has to run inside the argument expression, i.e. before the
    // superclass constructor is entered. Checking afterwards is unreachable.
    super( requireNonNullPathname( pathname ) );
    if( ! exists() )
    {
        throw new IOException( "File " + pathname + " does not exists!" );
    }
    if( ! canRead() )
    {
        throw new IOException( "Can not read file " + pathname + "!" );
    }
    logFileDirectives = new Hashtable();
    perl = new Perl5Util();
    // Create the Perl5Compiler and Perl5Matcher instances.
    compiler = new Perl5Compiler();
    matcher = new Perl5Matcher();
    // Compile the pattern; an invalid pattern surfaces to the caller as a
    // MalformedPatternException.
    pattern = compiler.compile( SHOP_LOG_FILE_FORMAT_FIELDNAME );
}

/**
* Rejects a null path name with the IOException the constructor documents.
* <p>
* @param pathname path name handed to the constructor; may be null
* @return the given path name, guaranteed to be non-null
* @throws IOException if the path name is null
*/
private static String requireNonNullPathname( String pathname ) throws IOException
{
    if( pathname == null )
    {
        throw new IOException( "File name can't be NULL!" );
    }
    return pathname;
}
/**
* Recognizes log file type.
* <p>
* @return log file type
* @exception IOException can't read from logfile
* @exception MalformedPatternException if there are some problems while recognizing a log file.
*/
public int recognize() throws IOException, MalformedPatternException
{
    // Type constant for each row of 'format', in row order. The rows are NOT
    // ordered like the numeric values of the constants, so the row index must
    // not be used as the log file type (row 0 is IIS, but 0 is
    // Extended_Log_File_Format). Keep this table in sync with 'format'.
    final int[] formatRowTypes =
    {
        IIS_Log_File_Format,
        NCSA_Combined_Log_File_Format,
        NCSA_Common_Log_File_Format,
        Intershop_Log_File_Format
    };

    // First try the directives found in the log file header.
    Hashtable directives = buildLogFileDirectives( this );
    logFileType = checkLogFileDirectives( directives );
    if( logFileType != NO_DIRECTIVES_LOG_FILE_FORMAT )
    {
        return logFileType;
    }

    // No directives: probe the file with the pattern of each known format.
    logFileType = UNRECOGNIZED;
    for( int i = 0; i < format.length; i++ )
    {
        if( checkSpecificPatternsInLogFile( format[i][1], this ) )
        {
            // Rows without an entry in the table above keep the former
            // behaviour of reporting the raw row index.
            logFileType = ( i < formatRowTypes.length ) ? formatRowTypes[i] : i;
            logFileSeparator = format[i][2];
            fieldsNames = format[i][3];
            description = format[i][0];
            break;
        }
    }

    if( logFileType == UNRECOGNIZED )
    {
        // Nothing matched: split on runs of blanks and use generated field names.
        logFileSeparator = "/[ ]+/";
        fieldsNames = buildLogFileFieldsUnrecognized();
        description = "Unrecognized";
    }
    return logFileType;
}
/**
* Sets how many fields may maximally be available in an unrecognized log file.
* <p>
* @param number Fields number.
*/
public void setLogFileUnrecognizedFields( int number )
{
    // The value is stored as given; no range check is applied here.
    this.logFileUnrecognizedFields = number;
}
/**
* Set how many bytes are needed to read to recognize log file.
* <p>
* @param bytes Bytes to read.
*/
public void setBytesToRecognize( int bytes )
{
    // The value is stored as given; no range check is applied here.
    this.bytesToRecognize = bytes;
}
/**
* Returns description of logfile type.
*
* @return description of logfile type
*/
public String getDescription()
{
    // Holds "Unrecognized" until recognize() stores the description of a matched format.
    return this.description;
}
/**
* Get this log file name without extension.
* <p>
* @return String instance containing log file name.
*/
public String getLogFileNameNoExtension() {
    final String fullName = getName();
    final int firstDot = fullName.indexOf( "." );
    // Keep everything in front of the FIRST dot; without a dot keep the whole name.
    final String baseName = ( firstDot == -1 ) ? fullName : fullName.substring( 0, firstDot );
    // A name that starts with a dot (or is empty) leaves nothing: fall back to "log".
    return ( baseName.length() == 0 ) ? "log" : baseName;
}
/**
* Get log file separator.
* <p>
* @return String instance containing log file separator.
*/
public String getLogFileSeparator()
{
    // Has no initial value at its declaration; recognize() assigns it on its
    // pattern-matching path.
    return this.logFileSeparator;
}
/**
* Reads the log file header, if it exists, and creates a hashtable where each
* key is a directive name.
* <p>
* @param file Log file name.
* @return Hashtable instance containing log file directives and their values
* @exception IOException IO error while reading logfile header
* @exception MalformedPatternException pattern error
*/
private Hashtable buildLogFileDirectives( File file ) throws IOException, MalformedPatternException
{
MatchActionProcessor processor = new MatchActionProcessor();
// we are trying to find directives according to extended log file format
// each directive has format "#DirectiveName: DirectiveValue"
// so, we use next awk regular expression "^#\w*:"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -