⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 logfilesequentialpreprocess.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

 /**
  * Title: XELOPES Data Mining Library
  * Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
  * Copyright: Copyright (c) 2004 Prudential Systems AG, Michael Krautmacher
  * @author Michael Krautmacher (michael@krautmacher.com)
  * @version 0.9 deprecated
  */

 /**
 *
 * This is an experimental version of a parsing class to be used on
 * transaction / item pairs extracted from log files with the parser
 * LogFileSequentialPreprocess.
 * This class is based on another preprocessor and a log file recogniser class
 * (LogFileStream and LogFileRecogniser).
 *
 * Important note:
 * This file is not maintained and the class should NOT be used.
 * If the sorting functionality is required, please use the class LogFileQParse.
 *
 */


package com.prudsys.pdm.Input.Records.Log;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Vector;

import org.apache.oro.text.perl.Perl5Util;

import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningVector;

public class LogFileSequentialPreprocess extends LogFileStream {

  // Sentinel field number meaning "not set". The constructor initialises all
  // field positions with it and the autodetection in
  // setTransactionIDParameters() only runs while the position still has this value.
  public static int DEFAULT_FIELD_NUM = -1;

  // Default extraction expressions, stored in format slot 0 by the constructor.
  // For the transaction ID, extractTransactionID() uses entry 0 as the
  // expression preceding the ID and entry 1 as the expression following it.
  // NOTE(review): presumably the item ID pair is used the same way - the item
  // extraction code is not visible here, confirm.
  public static String DEFAULT_VEC_EXTRACT_ITEM_ID[] = {"/ProductID=/","/\\&/"};
  public static String DEFAULT_VEC_EXTRACT_TRANSACTION_ID[] = {"/\\$sid\\$/","/\\?/"};

  // "No extraction" markers: extractTransactionID() returns the string
  // unparsed when entry 0 of a format equals [0], and skips the second
  // split when entry 1 equals [1].
  public static String DEFAULT_VEC_EXTRACT_NONE[] = {"",""};

  // Attribute names looked up in the source meta data when the transaction ID
  // field is autodetected (tried in the order uri, uri-query, uri-stem).
  public static String FIELD_NAME_URI = "cs-uri";
  public static String FIELD_NAME_URI_STEM = "cs-uri-stem";
  public static String FIELD_NAME_URI_QUERY = "cs-uri-query";

  // statistical variables; both are cleared by the constructor and by reset()
  protected int numLinesInvalid;
  protected int numLinesCount;

  // original variables cannot be overwritten, because they are required for this class
  // metaDataProcessed is built in the constructor with the attributes
  // "transactionId", "itemId" and "itemIndex".
  protected MiningDataSpecification metaDataProcessed;
  // NOTE(review): not referenced in the visible part of the file; presumably the
  // current vector of the processed stream - confirm.
  protected MiningVector cursorVectorProcessed;

  // position of required fields in the mining vector (for fast access)
  // transaction id and product id may be encoded in two separate attributes
  private int vecNumTransactionID;
  private int vecNumTransactionID2;
  private int vecNumItemID;
  private int vecNumItemID2;

  // One Vector of expression strings per parsing format; both tables are
  // allocated with 100 slots in the constructor.
  private Vector vecExtractItemIDX[]; // advanced filter strings for more formats
  private Vector vecExtractTransactionIDX[]; // advanced filter strings for more formats
  // format counter: index of the slot the next addExtractIDParameters() call writes to
  private int vecExtractNumX;

  // shared Perl5 matcher used for match/split while extracting IDs
  private Perl5Util perl;

  /**
   * Creates the preprocessing stream.
   *
   * Delegates to the superclass constructor, marks all field positions as
   * "not set" (DEFAULT_FIELD_NUM), stores the default extraction expressions
   * in format slot 0, clears the line statistics and builds the meta data of
   * the processed stream (transactionId, itemId, itemIndex).
   *
   * @param file String handed over unchanged to the superclass constructor
   */
  public LogFileSequentialPreprocess(String file)
  {
    super(file);

    perl = new Perl5Util();

    // statistics start at zero
    numLinesCount = 0;
    numLinesInvalid = 0;

    // no field position is known yet
    vecNumTransactionID = DEFAULT_FIELD_NUM;
    vecNumTransactionID2 = DEFAULT_FIELD_NUM;
    vecNumItemID = DEFAULT_FIELD_NUM;
    vecNumItemID2 = DEFAULT_FIELD_NUM;

    // slot 0 of both format tables carries the default expressions
    Vector defaultItemFormat = new Vector();
    defaultItemFormat.add(DEFAULT_VEC_EXTRACT_ITEM_ID[0]);
    defaultItemFormat.add(DEFAULT_VEC_EXTRACT_ITEM_ID[1]);
    Vector defaultTransactionFormat = new Vector();
    defaultTransactionFormat.add(DEFAULT_VEC_EXTRACT_TRANSACTION_ID[0]);
    defaultTransactionFormat.add(DEFAULT_VEC_EXTRACT_TRANSACTION_ID[1]);

    vecExtractItemIDX = new Vector[100];
    vecExtractItemIDX[0] = defaultItemFormat;
    vecExtractTransactionIDX = new Vector[100];
    vecExtractTransactionIDX[0] = defaultTransactionFormat;
    vecExtractNumX = 0;

    // "artificial" mining specification describing the processed stream
    MiningDataSpecification processedSpec = new MiningDataSpecification();
    processedSpec.setRelationName( super.getName() + " (sequencial analysis stream)" );

    // The unstored-category mode is not switched on here; use
    // setTransactionIDUnstoredCategory() to enable it (i.e. for decomposition algs).
    CategoricalAttribute transactionAttribute = new CategoricalAttribute( "transactionId" );
    processedSpec.addMiningAttribute( transactionAttribute );

    CategoricalAttribute itemAttribute = new CategoricalAttribute( "itemId" );
    processedSpec.addMiningAttribute( itemAttribute );

    NumericAttribute indexAttribute = new NumericAttribute("itemIndex");
    indexAttribute.setDataType(NumericAttribute.INTEGER);
    indexAttribute.setLowerBound( 0 );
    processedSpec.addMiningAttribute( indexAttribute );

    metaDataProcessed = processedSpec;
  }

  /**
   * Resets the stream by delegating to the superclass reset and then
   * clears the line statistics (counted lines and invalid lines).
   *
   * @throws MiningException operation failed
   */
  public void reset() throws MiningException
  {
    super.reset();

    // statistics are counted from scratch after every reset
    numLinesCount = numLinesInvalid = 0;
  }

  /**
   * Switches the "transactionId" attribute of the processed meta data
   * between stored and unstored categories.
   *
   * @param setting value passed on to setUnstoredCategories() of the attribute
   */
  public void setTransactionIDUnstoredCategory(boolean setting)
  {
    CategoricalAttribute transactionAttribute =
        (CategoricalAttribute) metaDataProcessed.getMiningAttribute("transactionId");
    transactionAttribute.setUnstoredCategories(setting);
  }


  /**
   * Autodetects the field of the source meta data that carries the
   * transaction ID (session ID).
   *
   * Does nothing if a field number has already been set. Otherwise the
   * attributes named FIELD_NAME_URI, FIELD_NAME_URI_QUERY and
   * FIELD_NAME_URI_STEM are tried in this order and the index of the first
   * one present is remembered.
   *
   * @throws MiningException none of the three attributes exists in the meta data
   */
  public void setTransactionIDParameters() throws MiningException
  {
    // a field number was chosen before - keep it
    if (vecNumTransactionID != DEFAULT_FIELD_NUM)
      return;

    // candidates in order of preference: uri, uri-query, uri-stem
    String candidateFields[] = { FIELD_NAME_URI, FIELD_NAME_URI_QUERY, FIELD_NAME_URI_STEM };

    for (int k = 0; k < candidateFields.length; k++)
    {
      if (metaData.getMiningAttribute(candidateFields[k]) != null)
      {
        // remember the index number, so values can be accessed directly
        vecNumTransactionID = metaData.getAttributeIndex(metaData.getMiningAttribute(candidateFields[k]));
        return;
      }
    }

    // no useful attribute found ...
    throw new MiningException(
        "Cannot extract transaction ID from meta data.");
  }


  /**
   * Sets the field numbers of the transaction ID (session ID) in the mining
   * vector explicitly and then runs the autodetection, which only takes
   * effect if vecNum is DEFAULT_FIELD_NUM.
   *
   * @param vecNum field number of transaction ID in mining vector; can be set to "DEFAULT_FIELD_NUM" for autodetection
   * @param vecNum2 second field number; stored as given
   *
   * @throws MiningException autodetection was requested but found no suitable attribute
   */
  public void setTransactionIDParameters(int vecNum, int vecNum2) throws MiningException
  {
    this.vecNumTransactionID = vecNum;
    this.vecNumTransactionID2 = vecNum2;

    // resolves DEFAULT_FIELD_NUM to a real field; no-op otherwise
    this.setTransactionIDParameters();
  }

  /**
   * Adds an additional parsing format, i.e. one set of extraction expressions
   * for the transaction ID and one for the item ID.
   *
   * Each array must contain 2 to 4 entries. For the transaction ID the entries
   * are used by extractTransactionID() as follows: entry 0 is the expression
   * preceding the ID, entry 1 the expression following it, the optional
   * entry 2 is a "kill" filter and the optional entry 3 a "match" filter.
   * NOTE(review): the item array is presumably interpreted the same way by the
   * item extraction, which is not visible here - confirm.
   *
   * Every non-empty entry is checked by running it once through Perl5Util;
   * empty strings are stored without validation. Both arrays are validated
   * completely before anything is stored, so a rejected call leaves the
   * already registered formats (including the defaults in slot 0) untouched.
   *
   * @param vecTransactionExtract String[] expressions for the transaction ID (2-4 entries)
   * @param vecItemExtract String[] expressions for the item ID (2-4 entries)
   * @throws MiningException an array is null, has fewer than 2 or more than 4
   *         entries, contains a null entry, or no free format slot is left
   */
  public void addExtractIDParameters(String vecTransactionExtract[], String vecItemExtract[]) throws MiningException
  {
    // the format tables have a fixed size; report an overflow as MiningException
    // instead of an unchecked ArrayIndexOutOfBoundsException
    if (vecExtractNumX >= vecExtractTransactionIDX.length || vecExtractNumX >= vecExtractItemIDX.length)
      throw new MiningException ("Cannot add another parsing format in function addExtractIDParameters(). Maximum number of formats (" + vecExtractTransactionIDX.length + ") reached.");

    Perl5Util validateExtract = new Perl5Util();

    // validate first, store afterwards: a failure must not leave a half-filled slot
    Vector transactionFormat = copyValidatedExtractParameters(vecTransactionExtract, "vecTransactionExtract[]", validateExtract);
    Vector itemFormat = copyValidatedExtractParameters(vecItemExtract, "vecItemExtract[]", validateExtract);

    vecExtractTransactionIDX[vecExtractNumX] = transactionFormat;
    vecExtractItemIDX[vecExtractNumX] = itemFormat;
    vecExtractNumX++;
  }

  /**
   * Checks one expression array passed to addExtractIDParameters() and copies
   * it into a new Vector.
   *
   * @param extract expressions to check and copy
   * @param paramName parameter name used in error messages
   * @param validator matcher used to test-run every non-empty expression
   * @return Vector holding the entries of extract in their original order
   * @throws MiningException extract is null, does not have 2-4 entries or contains a null entry
   */
  private static Vector copyValidatedExtractParameters(String extract[], String paramName, Perl5Util validator) throws MiningException
  {
    final String prefix = "Invalid parameter " + paramName + " passed to function addExtractIDParameters(). ";

    if (extract == null)
      throw new MiningException (prefix + "Array is null.");
    if (extract.length < 2)
      throw new MiningException (prefix + "Error encountered at array position " + extract.length + ".");
    if (extract.length > 4)
      throw new MiningException (prefix + "Too many entries (" + extract.length + " found, 2-4 are valid).");

    Vector copy = new Vector(extract.length);
    for (int i = 0; i < extract.length; i++)
    {
      if (extract[i] == null)
        throw new MiningException (prefix + "Error encountered at array position " + i + ".");

      // test-run the expression on an empty input; the result is ignored,
      // the call is only there so that a broken expression is reported now
      if (!extract[i].equals(""))
        validator.match(extract[i], "");

      copy.add(extract[i]);
    }
    return copy;
  }


  /**
   * Extracts the transaction ID (typically from the uri / uri-query) from the mining vector.
   *
   * The base string is obtained via extractBaseString() and then processed
   * with the expressions of the selected format:
   * entry 0 - expression preceding the ID (empty: return the base string as is),
   * entry 1 - expression following the ID (empty: return everything after entry 0),
   * entry 2 - optional "kill" filter, a match on the base string rejects the line,
   * entry 3 - optional "match" filter, the extracted ID must match it.
   * An empty string as filter entry is treated as "filter not active".
   *
   * @param miningVector mining vector
   * @param parseParemNum parameter string set number
   * @return transaction ID
   *
   * @throws MiningException the line was filtered, contains no transaction ID
   *         (including an ID marker with nothing behind it) or the ID does
   *         not pass the "match" filter
   */
  private String extractTransactionID(MiningVector miningVector, int parseParemNum) throws MiningException
  {
    // extract string containing the transaction id from the mining vector
    String sidString = extractBaseString(miningVector, vecNumTransactionID, vecNumTransactionID2);

    Vector format = vecExtractTransactionIDX[parseParemNum];

    // further parsing necessary?
    String startExpression = (String)format.elementAt(0);
    if (startExpression.equals(DEFAULT_VEC_EXTRACT_NONE[0]))
      return sidString;

    // "kill" filter active?
    if (format.size()>2)
    {
      String killFilter = (String)format.elementAt(2);
      // an empty entry is only a placeholder (e.g. when just a "match" filter is wanted)
      if (!killFilter.equals("") && perl.match(killFilter, sidString))
        throw new MiningException("Note: Transaction ID was filtered.");
    }

    // extract the transaction id from the string
    if (!perl.match(startExpression, sidString))
      throw new MiningException("Warning: Missing transaction id in log file.");

    Vector afterStart = new Vector(3);
    perl.split(afterStart, startExpression, sidString);
    // Perl-style split drops trailing empty fields, so a marker with nothing
    // behind it yields fewer than two parts - report it as a missing ID
    // instead of failing with an unchecked ArrayIndexOutOfBoundsException
    if (afterStart.size()<2)
      throw new MiningException("Warning: Missing transaction id in log file.");
    sidString = afterStart.elementAt(1).toString();

    String endExpression = (String)format.elementAt(1);
    if (endExpression.equals(DEFAULT_VEC_EXTRACT_NONE[1]))
      return sidString;

    Vector beforeEnd = new Vector(3);
    perl.split(beforeEnd, endExpression, sidString);
    // same reason as above: the split result is empty if nothing but
    // terminators (or nothing at all) is left
    if (beforeEnd.isEmpty())
      throw new MiningException("Warning: Missing transaction id in log file.");
    sidString = beforeEnd.elementAt(0).toString();

    // "match" filter active?
    if (format.size()>3)
    {
      String matchFilter = (String)format.elementAt(3);
      if (!matchFilter.equals("") && !perl.match(matchFilter, sidString))
        throw new MiningException("Note: Transaction ID has invalid format.");
    }

    return sidString;
  }

  /**
   * Extracts the field number of the item ID from the mining vector
   * This is the "automatic" version, which "autodetects" the item ID
   *
   * @throws MiningException
   */
  public void setItemIDParameters() throws MiningException
  {
    // if not already initialized
    if (vecNumItemID==DEFAULT_FIELD_NUM)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -