📄 associationrulesdecompalgorithm.java
字号:
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Michael Thess
* @version 1.0
*/
package com.prudsys.pdm.Models.AssociationRules;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAlgorithmParameter;
import com.prudsys.pdm.Core.MiningAlgorithmSpecification;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningEvent;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.MiningListener;
import com.prudsys.pdm.Core.Event.Status.CreationModelBeginMessage;
import com.prudsys.pdm.Input.MiningStoredData;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Models.AssociationRules.Event.CreationModelEndMessageAssociationRules;
import com.prudsys.pdm.Utils.GeneralUtils;
import com.prudsys.pdm.Utils.IntHashtable;
import com.prudsys.pdm.Utils.IntVector;
/**
* Wrapper for association rule algorithms based on a simple
* horizontal decomposition method of the MiningInputStream
* into stripes. This allows to apply association rules
* algorithms also on data which does not fit completely
* into memory.
*
* All stripes except the last one have the same length given by
* the parameter 'decompSize' specifying the number of transactions
* (not items!) contained in the stripes. The last stripe contains the remaining
* transactions and its transaction length <= decompSize. In the first run
* through the data, the association rules algorithm, which is specified
* via the parameter 'algorithmName', is locally applied to all stripes
* and the resulting rules are stored globally. In the second run,
* the rules are pruned via calculating their global support / confidence
* and removing all rules violating the global support / confidence condition.
* No matter how the parameter 'decompSize' is chosen, the result will
* be always the same.
*
* Three important remarks:
* 1. Unlike most memory-based association rules algorithms in XELOPES,
* this class requires that the MiningInputStream is already sorted
* by the transaction ID attribute.
* 2. An advantage over most memory-based association rules algorithms is
* that this class can also handle transaction ID attributes of the
* type 'unstoredCategories'. This means that categories (i.e. the transaction
* identifiers) are not stored in the transaction ID attribute and hence
* this memory is saved. This is especially useful when the algorithm
* deals with very large datasets with millions of transactions.
* 3. When choosing 'decompSize', make sure that the last stripe is not
* much smaller than the other ones. A too small last stripe would
* result in a huge number of locally calculated rules and extremely
* slow down the work of the whole decomposition algorithm (especially
* in the second - pruning - stage). You can also simply exclude the
* last domain from local rule generation (see option 'skipLastDomain')
* in the first stage. However, this may possibly reduce the set of all
* globally found rules leading to an approximate solution.
*
* @author Michael Thess
* @version 1.1
* @see AssociationRulesAlgorithm
*/
public class AssociationRulesDecompAlgorithm extends AssociationRulesAlgorithm implements MiningListener
{
/** Maximum number of transactions in decomposition. */
private int decompSize = 10000;
/** Name of local association rules algorithm from 'algorithms.xml'. */
private String algorithmName = "AprioriOptimized";
/** Approximate solution (exclude last domain from local rule generation). */
private boolean skipLastDomain = false;
/** Minimum item size. */
private int minimumItemSize = 1;
/** Maximum item size. */
private int maximumItemSize = -1;
/** Local transaction ID attribute. */
private CategoricalAttribute transactIdLoc;
/** Global number of all transactions. */
private int numberOfTransactions = -1;
/** Hashtable of large itemsets. */
private Hashtable lits = new Hashtable();
/** Hashtable of association rules. */
private Hashtable rules = new Hashtable();
/** Hashtable of large items. Points to itemTrans. */
private IntHashtable largeItems = new IntHashtable();
/** Vector of all transactions containing a large item. */
private IntVector[] itemTrans;
/** Debug level. */
private int debug = 1;
public AssociationRulesDecompAlgorithm() {
this.addMiningListener(this);
}
public void processMiningEvent(MiningEvent e) {
}
/**
* Runs association rules algorithm.
*
* @exception MiningException connot run algorithm
*/
protected void runAlgorithm() throws MiningException {
// Init:
lits.clear();
rules.clear();
largeItems.clear();
// Construct local metadata:
MiningDataSpecification locMetaData = new MiningDataSpecification();
locMetaData.addMiningAttribute( categoryItemId );
transactIdLoc = categoryTransactId;
if ( categoryTransactId.isUnstoredCategories() ) // use local transact attribute
transactIdLoc = new CategoricalAttribute( categoryTransactId.getName() );
locMetaData.addMiningAttribute( transactIdLoc );
locMetaData.setRelationName("local meta data");
/************************************************************************/
/* First scan over data to construct local rules... */
/************************************************************************/
// Reset mining input stream:
miningInputStream.reset();
// MAIN LOOP:
int nDecomp = 0; // number of decompositions
int nLocTrans = 0; // counter of local transactions
boolean firstItem = true; // first line
Category prevTransId = new Category(""); // previous transaction ID
ArrayList transact = new ArrayList(); // current transaction (= itemset)
ArrayList locMinVec = new ArrayList(); // current decomposition set
fireMiningEvent(new CreationModelBeginMessage(this.getAlgorithmLevel()));
while (miningInputStream.next()) {
// Read new mining vector:
MiningVector vector = miningInputStream.read();
// Empty line => ignore line:
if (vector == null)
continue;
// Get item and transaction IDs:
double itemId = vector.getValue( categoryItemId );
double transId = vector.getValue( categoryTransactId );
Category currTransId = categoryTransactId.getCategory(transId);
// Missing value => ignore line:
if ( Category.isMissingValue(itemId) || currTransId == null )
continue;
// First item:
if (firstItem) {
prevTransId = currTransId;
firstItem = false;
};
// New transaction:
if ( !currTransId.equals(prevTransId) ) {
// Add to local transaction set:
locMinVec.addAll(transact);
nLocTrans = nLocTrans + 1;
// New decomposition domain => run algorithm:
if (nLocTrans == decompSize) {
MiningStoredData msd = new MiningStoredData(locMinVec);
AssociationRulesMiningModel model = runAlgorithmLocal(msd);
addLargeItemSets( model.getLargeItemSets() );
addAssociationRules( model.getAssociationRules() );
nDecomp = nDecomp + 1;
nLocTrans = 0;
locMinVec.clear();
if ( categoryTransactId.isUnstoredCategories() )
transactIdLoc.removeValues();
if (debug > 0) System.out.println("iDecomp = " + nDecomp + " =========>");
};
prevTransId = currTransId;
transact.clear();
};
// Add item to transaction:
if ( categoryTransactId.isUnstoredCategories() ) {
transId = transactIdLoc.getKey(currTransId);
if (Category.isMissingValue(transId) )
transId = transactIdLoc.addCategory(currTransId);
};
double[] vec = {itemId, transId};
MiningVector mv = new MiningVector(vec);
mv.setMetaData(locMetaData);
transact.add(mv);
};
// Last transaction and last algorithm run:
if (transact.size() > 0 && (!skipLastDomain || nDecomp == 0) ) {
locMinVec.addAll(transact);
nLocTrans = nLocTrans + 1;
MiningStoredData msd = new MiningStoredData(locMinVec);
AssociationRulesMiningModel model = runAlgorithmLocal(msd);
addLargeItemSets( model.getLargeItemSets() );
addAssociationRules( model.getAssociationRules() );
nDecomp = nDecomp + 1;
};
if ( categoryTransactId.isUnstoredCategories() )
transactIdLoc.removeValues();
if (debug > 0)
System.out.println("nDecomp = " + nDecomp + " --------------------");
/************************************************************************/
/* ... first scan over data to construct local rules. */
/************************************************************************/
// Fill hashtable of all one-element large itemsets:
int nLit = 0;
Enumeration em = lits.keys();
while (em.hasMoreElements() ) {
ItemSet is = (ItemSet) em.nextElement();
int itemSize = is.getSize();
is.setSupportCount(0); // clear support count which was local
for (int j = 0; j < itemSize; j++) {
int pN = is.getIntegerAt(j);
if (largeItems.get(pN) == null) {
largeItems.put(pN, nLit);
nLit = nLit + 1;
};
};
};
// Initialize transaction vector:
itemTrans = new IntVector[nLit];
/************************************************************************/
/* Second scan over data to get global rules by pruning... */
/************************************************************************/
// Reset mining input stream:
miningInputStream.reset();
// MAIN LOOP:
int nTrans = 0; // number of all transactions
nDecomp = 0; // number of decompositions
nLocTrans = 0; // counter of local transactions
firstItem = true; // first line
prevTransId = new Category(""); // previous transaction ID
transact = new ArrayList(); // current transaction (= itemset)
locMinVec = new ArrayList(); // current decomposition set
while (miningInputStream.next()) {
// Read new mining vector:
MiningVector vector = miningInputStream.read();
// Empty line => ignore line:
if (vector == null)
continue;
// Get item and transaction IDs:
double itemId = vector.getValue( categoryItemId );
double transId = vector.getValue( categoryTransactId );
Category currTransId = categoryTransactId.getCategory(transId);
// Missing value => ignore line:
if ( Category.isMissingValue(itemId) || currTransId == null )
continue;
// First item:
if (firstItem) {
prevTransId = currTransId;
firstItem = false;
};
// New transaction:
if ( !currTransId.equals(prevTransId) ) {
// Add to local transaction set:
locMinVec.addAll(transact);
nLocTrans = nLocTrans + 1;
// New decomposition domain => update support and confidence numbers:
if (nLocTrans == decompSize) {
MiningStoredData msd = new MiningStoredData(locMinVec);
updateSuppConf(msd);
nTrans = nTrans + nLocTrans;
nDecomp = nDecomp + 1;
nLocTrans = 0;
locMinVec.clear();
if ( categoryTransactId.isUnstoredCategories() )
transactIdLoc.removeValues();
if (debug > 0) System.out.println("iDecomp = " + nDecomp + " <========");
};
prevTransId = currTransId;
transact.clear();
};
// Add item to transaction:
if ( categoryTransactId.isUnstoredCategories() ) {
transId = transactIdLoc.getKey(currTransId);
if (Category.isMissingValue(transId) )
transId = transactIdLoc.addCategory(currTransId);
};
double[] vec = {itemId, transId};
MiningVector mv = new MiningVector(vec);
mv.setMetaData(locMetaData);
transact.add(mv);
};
// Last transaction and last update support and confidence numbers:
if (transact.size() > 0) {
locMinVec.addAll(transact);
nLocTrans = nLocTrans + 1;
nTrans = nTrans + nLocTrans;
nDecomp = nDecomp + 1;
MiningStoredData msd = new MiningStoredData(locMinVec);
updateSuppConf(msd);
};
numberOfTransactions = nTrans;
/************************************************************************/
/* ...second scan over data to get global rules by pruning. */
/************************************************************************/
// Remove all large itemsets with too small support:
int minSupp = (int) (minimumSupport*nTrans + 0.9999999);
em = lits.keys();
while (em.hasMoreElements() ) {
ItemSet is = (ItemSet) em.nextElement();
int suppCt = is.getSupportCount();
if (suppCt < minSupp)
lits.remove(is);
else
lits.put(is, new Integer(suppCt));
};
// Remove all rules with too small support or confidence:
em = rules.keys();
while (em.hasMoreElements()) {
// Get rule:
RuleSet rs = (RuleSet) em.nextElement();
// Get premise and conclusion if large itemsets:
ItemSet prem = rs.getPremise();
ItemSet conc = rs.getConclusion();
Integer SuppA = (Integer) lits.get(prem);
Integer SuppB = (Integer) lits.get(conc);
if (SuppA == null || SuppB == null) {
rules.remove(rs);
continue;
};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -