
📄 dhp.java

📁 Source code of a data mining system
💻 JAVA
📖 Page 1 of 3

/**
 *
 *   AgentAcademy - an open source Data Mining framework for
 *   training intelligent agents
 *
 *   Copyright (C)   2001-2003 AA Consortium.
 *
 *   This library is open source software; you can redistribute it
 *   and/or modify it under the terms of the GNU Lesser General
 *   Public License as published by the Free Software Foundation;
 *   either version 2.0 of the License, or (at your option) any later
 *   version.
 *
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free
 *   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *   MA  02111-1307 USA
 *
 */

package org.agentacademy.modules.dataminer.association;

/**
 * <p>Title: The Data Miner prototype</p>
 * <p>Description: A prototype for the DataMiner (DM), the Agent Academy (AA) module responsible for performing data mining on the contents of the Agent Use Repository (AUR). The extracted knowledge is to be sent back to the AUR in the form of a PMML document.</p>
 * <p>Copyright: Copyright (c) 2002</p>
 * <p>Company: CERTH</p>
 * @author asymeon
 * @version 0.31
 */

import java.io.*;
import java.util.*;
import org.agentacademy.modules.dataminer.core.*;
import org.agentacademy.modules.dataminer.filters.Filter;
import org.agentacademy.modules.dataminer.filters.AttributeFilter;
import org.jdom.*;
import org.jdom.output.*;
import org.apache.log4j.Logger;

/**
 * Class implementing an Apriori-type algorithm. Iteratively reduces the minimum
 * support until it finds the required number of rules with the given minimum
 * confidence. <p>
 *
 * Valid options are:<p>
 *
 * -N required number of rules <br>
 * The required number of rules (default: 10). <p>
 *
 * -T type of metric by which to sort rules <br>
 * 0 = confidence <p>
 *
 * -C minimum confidence of a rule <br>
 * The minimum confidence of a rule (default: 0.9). <p>
 *
 * -D delta for minimum support <br>
 * The delta by which the minimum support is decreased in
 * each iteration (default: 0.05). <p>
 *
 * -U upper bound for minimum support <br>
 * The upper bound for minimum support. Don't explicitly look for
 * rules with more than this level of support. <p>
 *
 * -M lower bound for minimum support <br>
 * The lower bound for the minimum support (default = 0.1). <p>
 *
 * -S significance level <br>
 * If used, rules are tested for significance at
 * the given level. Slower (default = no significance testing). <p>
 *
 * -R <br>
 * If set then columns that contain all missing values are removed from
 * the data. <p>
 *
 * -I <br>
 * If set the itemsets found are also output (default = no). <p>
 *
 */

public class DHP extends Associator implements OptionHandler {
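  /*
   * Illustrative usage sketch (not part of the original listing). It assumes
   * a WEKA-style Instances object and that setOptions(String[]) from the
   * OptionHandler interface is implemented on a later page of this file, so
   * treat it as a sketch rather than a guaranteed API; exception handling is
   * omitted for brevity.
   *
   *   DHP dhp = new DHP();                                    // defaults from resetOptions()
   *   dhp.setOptions(new String[] {"-N", "20", "-C", "0.8"}); // 20 rules, min confidence 0.8 (assumed setter)
   *   dhp.buildAssociations(instances);                       // mine rules from the transactions
   *   System.out.println(dhp);                                // print the rules that were found
   */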

  /** The log4j logger for this class. */
  public static Logger log = Logger.getLogger(DHP.class);

  /** The PMML document. */
  public static Document pmmlDocument = null;
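  /*
   * The PMML document itself is built elsewhere in this module (on a later
   * page of the listing). A hypothetical JDOM initialisation, shown only to
   * illustrate how the imported org.jdom classes would come into play:
   *
   *   pmmlDocument = new Document(new Element("PMML"));
   */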

  /** The minimum support. */
  protected double m_minSupport;

  /** The upper bound for the minimum support. */
  protected double m_upperBoundMinSupport;

  /** The lower bound for the minimum support. */
  protected double m_lowerBoundMinSupport;

  /** Metric types. */
  protected static final int CONFIDENCE = 0;
  public static final Tag [] TAGS_SELECTION = {
    new Tag(CONFIDENCE, "Confidence"),
      };

  /** The selected metric type. */
  protected int m_metricType = CONFIDENCE;

  /** The minimum metric score. */
  protected double m_minMetric;

  /** The maximum number of rules that are output. */
  protected int m_numRules;

  /** Delta by which m_minSupport is decreased in each iteration. */
  protected double m_delta;

  /** Significance level for optional significance test. */
  protected double m_significanceLevel;

  /** Number of cycles used before the required number of rules was found. */
  protected int m_cycles;

  /** The set of all sets of itemsets L. */
  protected FastVector m_Ls;

  /** The same information stored in hash tables. */
  protected FastVector m_hashtables;

  /** The list of all generated rules. */
  protected FastVector[] m_allTheRules;

  /** The instances (transactions) to be used for generating
      the association rules. */
  protected Instances m_instances;

  /** Output itemsets found? */
  protected boolean m_outputItemSets;

  /** Remove columns that contain only missing values? */
  protected boolean m_removeMissingCols;

  /** Report progress iteratively */
  protected boolean m_verbose;

  /**
   * Returns a string describing this associator
   * @return a description of the associator suitable for
   * displaying in the GUI
   */
  public String globalInfo() {
    return "Finds association rules.";
  }

  /**
   * Constructor that sets default values for the minimum confidence
   * and the maximum number of rules.
   */
  public DHP() {

    resetOptions();
  }

  /**
   * Resets the options to the default values.
   */
  public void resetOptions() {

    m_removeMissingCols = false;
    m_verbose = false;
    m_delta = 0.05;
    m_minMetric = 0.90;
    m_numRules = 10;
    m_lowerBoundMinSupport = 0.1;
    m_upperBoundMinSupport = 1.0;
    m_significanceLevel = -1;
    m_outputItemSets = false;
  }

  /**
   * Removes columns that contain only missing values from the data.
   * @param instances the instances
   * @return a new set of instances with all-missing columns removed
   * @exception Exception if the attribute filter cannot be applied
   */
  private Instances removeMissingColumns(Instances instances)
    throws Exception {
    int numInstances = instances.numInstances();
    StringBuffer deleteString = new StringBuffer();
    int removeCount = 0;
    boolean first = true;
    int maxCount = 0;

    for (int i=0;i<instances.numAttributes();i++) {
      AttributeStats as = instances.attributeStats(i);
      if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
	// see if we can decrease this by looking for the most frequent value
	int [] counts = as.nominalCounts;
	if (counts[Utils.maxIndex(counts)] > maxCount) {
	  maxCount = counts[Utils.maxIndex(counts)];
	}
      }
      if (as.missingCount == numInstances) {
	if (first) {
	  deleteString.append((i+1));
	  first = false;
	} else {
	  deleteString.append(","+(i+1));
	}
	removeCount++;
      }
    }
    if (m_verbose) {
      System.err.println("Removed : "+removeCount+" columns with all missing "
			 +"values.");
    }
    if (m_upperBoundMinSupport == 1.0 && maxCount != numInstances) {
      m_upperBoundMinSupport = (double)maxCount / (double)numInstances;
      if (m_verbose) {
	System.err.println("Setting upper bound min support to : "
			   +m_upperBoundMinSupport);
      }
    }

    if (deleteString.toString().length() > 0) {
      AttributeFilter af = new AttributeFilter();
      af.setAttributeIndices(deleteString.toString());
      af.setInvertSelection(false);
      af.setInputFormat(instances);
      Instances newInst = Filter.useFilter(instances, af);

      return newInst;
    }
    return instances;
  }
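  /*
   * Worked example (added for clarity, not in the original source): with 100
   * transactions in which attributes 2 and 5 are missing in every instance,
   * deleteString becomes "2,5" and the AttributeFilter drops those two
   * columns. If the most frequent value of any remaining attribute occurs in,
   * say, 80 instances, the upper bound for minimum support is lowered to
   * 80 / 100 = 0.8.
   */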

  /**
   * Method that generates all large itemsets with a minimum support, and from
   * these all association rules with a minimum confidence.
   *
   * @param instances the instances to be used for generating the associations
   * @exception Exception if rules can't be built successfully
   */
  public void buildAssociations(Instances instances) throws Exception {

    double[] confidences, supports;
    int[] indices;
    FastVector[] sortedRuleSet;
    int necSupport=0;

    if (instances.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    if (m_removeMissingCols) {
      instances = removeMissingColumns(instances);
    }

    // Decrease minimum support until desired number of rules found.
    m_cycles = 0;
    m_minSupport = m_upperBoundMinSupport - m_delta;
    m_minSupport = (m_minSupport < m_lowerBoundMinSupport)
      ? m_lowerBoundMinSupport
      : m_minSupport;
    do {

      // Reserve space for variables
      m_Ls = new FastVector();
      m_hashtables = new FastVector();
      m_allTheRules = new FastVector[6];
      m_allTheRules[0] = new FastVector();
      m_allTheRules[1] = new FastVector();
      m_allTheRules[2] = new FastVector();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	m_allTheRules[3] = new FastVector();
	m_allTheRules[4] = new FastVector();
	m_allTheRules[5] = new FastVector();
      }
      sortedRuleSet = new FastVector[6];
      sortedRuleSet[0] = new FastVector();
      sortedRuleSet[1] = new FastVector();
      sortedRuleSet[2] = new FastVector();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	sortedRuleSet[3] = new FastVector();
	sortedRuleSet[4] = new FastVector();
	sortedRuleSet[5] = new FastVector();
      }

      // Find large itemsets and rules
      findLargeItemSets(instances);
      if (m_significanceLevel != -1 || m_metricType != CONFIDENCE)
	findRulesBruteForce();
      else
	findRulesQuickly();

      // Sort rules according to their support
      supports = new double[m_allTheRules[2].size()];
      for (int i = 0; i < m_allTheRules[2].size(); i++)
	supports[i] = (double)((ItemSet)m_allTheRules[1].elementAt(i)).support();
      indices = Utils.stableSort(supports);
      for (int i = 0; i < m_allTheRules[2].size(); i++) {
	sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i]));
	sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i]));
	sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i]));
	if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	  sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i]));
	  sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i]));
	  sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i]));
	}
      }

      // Sort rules according to their confidence
      m_allTheRules[0].removeAllElements();
      m_allTheRules[1].removeAllElements();
      m_allTheRules[2].removeAllElements();
      if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	m_allTheRules[3].removeAllElements();
	m_allTheRules[4].removeAllElements();
	m_allTheRules[5].removeAllElements();
      }
      confidences = new double[sortedRuleSet[2].size()];
      int sortType = 2 + m_metricType;

      for (int i = 0; i < sortedRuleSet[2].size(); i++)
	confidences[i] =
	  ((Double)sortedRuleSet[sortType].elementAt(i)).doubleValue();
      indices = Utils.stableSort(confidences);
      for (int i = sortedRuleSet[0].size() - 1;
	   (i >= (sortedRuleSet[0].size() - m_numRules)) && (i >= 0); i--) {
	m_allTheRules[0].addElement(sortedRuleSet[0].elementAt(indices[i]));
	m_allTheRules[1].addElement(sortedRuleSet[1].elementAt(indices[i]));
	m_allTheRules[2].addElement(sortedRuleSet[2].elementAt(indices[i]));
	if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
	  m_allTheRules[3].addElement(sortedRuleSet[3].elementAt(indices[i]));
	  m_allTheRules[4].addElement(sortedRuleSet[4].elementAt(indices[i]));
	  m_allTheRules[5].addElement(sortedRuleSet[5].elementAt(indices[i]));
	}
      }
      m_minSupport -= m_delta;
      m_minSupport = (m_minSupport < m_lowerBoundMinSupport)
	? 0
	: m_minSupport;

      necSupport = (int)(m_minSupport *
			 (double)instances.numInstances()+0.5);

      m_cycles++;
      if (m_verbose) {
	if (m_Ls.size() > 1) {
	  System.out.println(toString());
	}
      }
    } while ((m_allTheRules[0].size() < m_numRules) &&
	     (m_minSupport >= m_lowerBoundMinSupport)
	     /*	     (necSupport >= lowerBoundNumInstancesSupport)*/
	     /*	     (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport)) */ &&
	     (necSupport >= 1));
    m_minSupport += m_delta;
  }
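  /*
   * Worked example of the search schedule (added for clarity): with the
   * defaults upperBound = 1.0, delta = 0.05 and lowerBound = 0.1, the loop
   * starts at minSupport = 0.95 and tries 0.90, 0.85, ... until either
   * m_numRules rules have been collected or the lower bound is passed. On a
   * data set of 200 transactions, a pass at minSupport = 0.25 requires
   * necSupport = (int)(0.25 * 200 + 0.5) = 50 supporting instances.
   */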

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {

    String string1 = "\tThe required number of rules. (default = " + m_numRules + ")",
      string2 =
      "\tThe minimum confidence of a rule. (default = " + m_minMetric + ")",
      string3 = "\tThe delta by which the minimum support is decreased in\n",
      string4 = "\teach iteration. (default = " + m_delta + ")",
      string5 =
      "\tThe lower bound for the minimum support. (default = " +
      m_lowerBoundMinSupport + ")",
      string6 = "\tIf used, rules are tested for significance at\n",
      string7 = "\tthe given level. Slower. (default = no significance testing)",
      string8 = "\tIf set the itemsets found are also output. (default = no)",
      stringType = "\tThe metric type by which to rank rules. (default = "
      +"confidence)";


    FastVector newVector = new FastVector(10);

    newVector.addElement(new Option(string1, "N", 1,
				    "-N <required number of rules output>"));
    newVector.addElement(new Option(stringType, "T", 1,
				    "-T <0=confidence>"));
    newVector.addElement(new Option(string2, "C", 1,
				    "-C <minimum metric score of a rule>"));
    newVector.addElement(new Option(string3 + string4, "D", 1,
				    "-D <delta for minimum support>"));
    newVector.addElement(new Option("\tUpper bound for minimum support. "
				    +"(default = 1.0)", "U", 1,
				     "-U <upper bound for minimum support>"));
    newVector.addElement(new Option(string5, "M", 1,
				    "-M <lower bound for minimum support>"));
    newVector.addElement(new Option(string6 + string7, "S", 1,
				    "-S <significance level>"));
    newVector.addElement(new Option(string8, "I", 0,
				    "-I"));
    newVector.addElement(new Option("\tRemove columns that contain "
				    +"all missing values (default = no)"
				    , "R", 0,
				    "-R"));
    newVector.addElement(new Option("\tReport progress iteratively. (default "
				    +"= no)", "V", 0,
				    "-V"));

    return newVector.elements();
  }
