📄 syntheticdatagenerator.java
字号:
/*
  ARMiner - Association Rules Miner
  Copyright (C) 2000 UMass/Boston - Computer Science Department

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or (at
  your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  USA

  The ARMiner Server was written by Dana Cristofor and Laurentiu
  Cristofor.

  The ARMiner Client was written by Abdelmajid Karatihy, Xiaoyong Kuang,
  and Lung-Tsung Li.

  The ARMiner package is currently maintained by Laurentiu Cristofor
  (laur@cs.umb.edu).
*/

import java.util.*;

/*
  Maintenance log started on November 19th, 2000

  Nov. 19th, 2000 - I fixed a potential problem related to a bad way
  of initializing RandomSample, now I create one Random object and
  use it in all RandomSample objects. Also I use randTransaction as a
  source of random numbers for nextTransaction.
*/

/**
  SyntheticDataGenerator.java<P>

  This class implements a synthetic data generator that generates
  data by simulating transactions in a supermarket. The algorithm is
  described in the article "Fast Algorithms for Mining Association
  Rules" by Rakesh Agrawal and Ramakrishnan Srikant from IBM Almaden
  Research Center, 1994. I have also used as additional information
  the C++ source code of the generator that is kindly distributed by
  Mr. Rakesh Agrawal and the Master Thesis of Mr. Andreas Mueller.<P>

*/

/*
  This file is a part of the ARMiner project.
(P)1999-2000 by ARMiner Server Team:
  Dana Cristofor
  Laurentiu Cristofor
*/

public class SyntheticDataGenerator{
    // ---- generation parameters (copied from the constructor arguments) ----

    // number of transactions to generate
    private long num_transactions;
    // average size of a transaction
    private int avg_transaction_size;
    // number of large itemsets used as patterns
    private int num_large_itemsets;
    // average size of a large itemset
    private int avg_large_itemset_size;
    // number of items that may appear in transactions
    private int num_items;
    // mean correlation between the large itemsets
    private double correlation_mean;
    // mean of the corruption coefficient of the large itemsets
    private double corruption_mean;

    // ---- generation state ----

    // set to 0 by the constructor; hasMoreTransactions() and
    // getNextTransaction() compare it against num_transactions
    private long current_transaction;

    // NOTE(review): not referenced in the visible part of this file;
    // presumably a random source for picking items - confirm
    private Random rand_item;
    // used for selecting a random large itemset
    private Random rand_large;
    // handed to every RandomSample created in getNextTransaction()
    private Random rand_sampling;
    // source of the uniform random numbers drawn in getNextTransaction()
    private Random rand_transaction;

    // source of transaction sizes: the constructor creates it with
    // avg_transaction_size - 1 and getNextTransaction() adds 1 to each draw
    private RandomPoissonDistribution poisson_transaction_size;

    // A pattern used when generating transactions.
    private class LargeItemset {
        // the items of the pattern
        Itemset is;
        // NOTE(review): presumably the selection weight of this pattern;
        // it is not read in the visible part of this file - confirm
        double weight;
        // getNextTransaction() shrinks the pattern by one item for as long
        // as a uniform random number is less than this value
        double corruption;
    }

    // NOTE(review): presumably holds the LargeItemset patterns; it is
    // filled outside the visible part of this file - confirm
    private Vector large_itemsets;
    // NOTE(review): not referenced in the visible part of this file;
    // presumably per-item selection probabilities - confirm
    private double[] item_probabilities;
    // NOTE(review): not referenced in the visible part of this file;
    // presumably supports ungetRandomLargeItemset() - confirm
    private int last_large;

    /**
     * Create a new synthetic data generator with mean correlation 0.5
     * and mean corruption 0.5. This simply delegates to the
     * seven-argument constructor.
     *
     * @param num_transactions the number of transactions to generate
     * @param avg_transaction_size the average size of a transaction
     * @param num_large_itemsets the number of large itemsets to be used
     * as patterns in the generation of transactions
     * @param avg_large_itemset_size the average size of a large itemset
     * @param num_items the number of items to appear in transactions
     * @exception IllegalArgumentException if the integer arguments
     * are not strictly positive or if either average size is greater
     * than num_items.
     */
    public SyntheticDataGenerator(long num_transactions,
                                  int avg_transaction_size,
                                  int num_large_itemsets,
                                  int avg_large_itemset_size,
                                  int num_items) {
        this (num_transactions, avg_transaction_size,
              num_large_itemsets, avg_large_itemset_size,
              num_items, 0.5, 0.5);
    }

    /**
     * Create a new synthetic data generator.
* * @param num_transactions the number of transactions to generate * @param avg_transaction_size the average size of a transaction * @param num_large_itemsets the number of large itemsets to be used * as patterns in the generation of transactions * @param avg_large_itemset_size the average size of a large itemset * @param num_items the number of items to appear in transactions * @param correlation_mean the mean correlation between the large * itemsets * @param corruption_mean the mean of the corruption coefficient * that will indicate how much a large itemset will be corrupted before * being used. * @exception IllegalArgumentException if the integer arguments * are not strictly positive or if the floating point arguments are * not between 0 and 1. */ public SyntheticDataGenerator(long num_transactions, int avg_transaction_size, int num_large_itemsets, int avg_large_itemset_size, int num_items, double correlation_mean, double corruption_mean) { if (num_transactions < 1 || avg_transaction_size < 1 || num_large_itemsets < 1 || avg_large_itemset_size < 1 || num_items < 1 || correlation_mean < 0 || correlation_mean > 1 || corruption_mean < 0 || corruption_mean > 1 || avg_transaction_size > num_items || avg_large_itemset_size > num_items) throw new IllegalArgumentException("Invalid arguments!"); this.num_transactions = num_transactions; this.avg_transaction_size = avg_transaction_size; this.num_large_itemsets = num_large_itemsets; this.avg_large_itemset_size = avg_large_itemset_size; this.num_items = num_items; this.correlation_mean = correlation_mean; this.corruption_mean = corruption_mean; current_transaction = 0; poisson_transaction_size = new RandomPoissonDistribution(avg_transaction_size - 1); // used for selecting a random large itemset rand_large = new Random(); // used by RandomSample rand_sampling = new Random(); // used in getNextTransaction rand_transaction = new Random(); initLargeItemsets(); } /** * Tell whether there are more transactions to generate. 
*
     * @return true if there are more transactions, false otherwise
     */
    public boolean hasMoreTransactions() {
        return (current_transaction < num_transactions);
    }

    /**
     * Get next transaction. The transaction is assembled from randomly
     * chosen large itemsets, each of which is first corrupted (shrunk)
     * and then sampled.
     *
     * @exception NoSuchElementException if all transactions were generated
     * @return an Itemset representing the transaction
     */
    public Itemset getNextTransaction() {
        if (current_transaction >= num_transactions) {
            throw new NoSuchElementException("No more transactions to generate!");
        }

        // the transaction size is obtained from a Poisson distribution
        // with mean avg_transaction_size (the distribution itself is
        // created with avg_transaction_size - 1, and 1 is added here,
        // so the size is always at least 1)
        int transaction_size = (int)poisson_transaction_size.nextLong() + 1;
        if (transaction_size > num_items) {
            transaction_size = num_items;
        }

        Itemset transaction = new Itemset(transaction_size);

        while (transaction.size() < transaction_size) {
            LargeItemset pattern = nextRandomLargeItemset();

            // we corrupt the pattern by reducing its size for as long
            // as a uniformly distributed random number is less than
            // the corruption of the large itemset.
            int pattern_length = pattern.is.size();
            while (pattern_length > 0
                   && rand_transaction.nextDouble() < pattern.corruption) {
                pattern_length--;
            }

            // in case the large itemset does not fit in the transaction
            // we will put the itemset in the transaction anyway in 50%
            // of the cases, and in the rest we'll keep the itemset for
            // the next transaction. An empty transaction always takes
            // the itemset. The random number is drawn only when the two
            // preceding conditions hold (short-circuit evaluation).
            if (pattern_length + transaction.size() > transaction_size
                && transaction.size() > 0
                && rand_transaction.nextDouble() < 0.5) {
                // keep the itemset for next transaction
                ungetRandomLargeItemset();
                break;
            }

            // now we have to pick pattern_length items at random from
            // the pattern
            if (pattern_length > 0) {
                RandomSample rand_sample = new RandomSample(pattern.is.size(),
                                                            pattern_length,
                                                            rand_sampling);
                long[] sample = rand_sample.nextSample();
                // the "- 1" turns each sampled value into a getItem() index
                for (int j = 0; j < sample.length; j++) {
                    transaction.addItem(pattern.is.getItem((int)sample[j] - 1));
                }
            }
        }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -