📄 syntheticdatagenerator.java
字号:
// NOTE(review): tail of a method whose beginning lies above this chunk
// (presumably getNextTransaction()): bumps the running transaction counter
// and returns the transaction just built. Left byte-identical.
current_transaction++;
return transaction; }

/**
 * Return the large itemsets used in the generation of transactions.
 * This can be useful for debugging.
 *
 * @return a Vector containing the large itemsets as Itemset objects.
 */
public Vector getLargeItemsets() {
    Vector large = new Vector();
    // Expose only the Itemset payload of each pool entry, not the
    // weight/corruption bookkeeping carried by LargeItemset.
    for (int i = 0; i < large_itemsets.size(); i++)
        large.add(((LargeItemset)large_itemsets.get(i)).is);
    return large;
}

// this method creates a random pool of large itemsets that will
// be used in the generation of transactions
//
// Pool construction, per itemset:
//   - size drawn from Poisson(avg_large_itemset_size - 1) + 1, capped at num_items;
//   - a fraction of items is copied from the PREVIOUS itemset (exponential
//     draw with mean correlation_mean) so consecutive itemsets correlate;
//   - remaining slots filled with items drawn via nextRandomItem();
//   - weight ~ Exponential(mean 1), corruption ~ Normal(corruption_mean, sd 0.1).
// Weights are then normalized to sum to 1 and converted in place into a
// cumulative distribution so nextRandomLargeItemset() can binary-search it.
// NOTE(review): this mirrors the IBM Quest generator of Agrawal & Srikant —
// presumably intentional; confirm against the paper if behavior matters.
private void initLargeItemsets() {
    // used for selecting a random item
    rand_item = new Random();

    // assign probabilities to items
    initItemProbabilities();

    large_itemsets = new Vector(num_large_itemsets);
    RandomPoissonDistribution poisson_large_size
        = new RandomPoissonDistribution(avg_large_itemset_size - 1);
    RandomExponentialDistribution exp_correlation
        = new RandomExponentialDistribution(correlation_mean);
    RandomExponentialDistribution exp_weight
        = new RandomExponentialDistribution();
    Random normal_corruption = new Random();

    for (int i = 0; i < num_large_itemsets; i++) {
        // the large itemset size is obtained from a Poisson distribution
        // with mean avg_large_itemset_size (mean-1 draw, then +1, so size >= 1)
        int large_itemset_size = (int)poisson_large_size.nextLong() + 1;
        if (large_itemset_size > num_items)
            large_itemset_size = num_items;

        LargeItemset large = new LargeItemset();
        large.is = new Itemset(large_itemset_size);

        if (i > 0) {
            // get previous large itemset
            LargeItemset prev_large = (LargeItemset)large_itemsets.get(i - 1);

            // we determine the fraction of items to use from the
            // previous itemset using an exponential distribution
            // with mean equal to correlation_mean
            // (we add 0.5 to round off)
            int fraction = (int)(((double)large_itemset_size)
                                 * exp_correlation.nextDouble() + 0.5);

            // make adjustments if necessary: cannot copy more items than
            // this itemset holds, nor more than the previous itemset has
            if (fraction > large_itemset_size)
                fraction = large_itemset_size;
            if (fraction > prev_large.is.size())
                fraction = prev_large.is.size();

            // select randomly the fraction of items from the previous
            // large itemset; RandomSample returns 1-based positions,
            // hence the "- 1" when indexing into the previous itemset
            if (fraction > 0) {
                RandomSample rand_sample
                    = new RandomSample(prev_large.is.size(), fraction, rand_sampling);
                long[] sample = rand_sample.nextSample();
                for (int j = 0; j < sample.length; j++)
                    large.is.addItem(prev_large.is.getItem((int)sample[j] - 1));
            }
        }

        // add items randomly until we fill the itemset
        // NOTE(review): assumes Itemset.addItem() ignores duplicates or that
        // size() counts distinct items; otherwise this loop could stall or
        // over-count — cannot tell from here, confirm against Itemset.
        while (large.is.size() < large_itemset_size)
            large.is.addItem(nextRandomItem());

        // we associate to this itemset a weight picked from
        // an exponential distribution with unit mean
        large.weight = exp_weight.nextDouble();

        // we also assign a corruption level obtained from a
        // normal distribution with mean corruption_mean and variance 0.1
        // (nextGaussian() * 0.1 scales the STANDARD DEVIATION to 0.1,
        // i.e. variance 0.01 — the original comment overstates it)
        large.corruption = normal_corruption.nextGaussian() * 0.1 + corruption_mean;

        large_itemsets.add(large);
    }

    // now we have to normalize the weights of the large itemsets
    // such that their sum will total 1. This is done by dividing
    // each weight by their sum
    double sum = 0.0;
    for (int i = 0; i < num_large_itemsets; i++)
        sum += ((LargeItemset)large_itemsets.get(i)).weight;
    for (int i = 0; i < num_large_itemsets; i++)
        ((LargeItemset)large_itemsets.get(i)).weight /= sum;

    // finally we cumulate the probabilities in order to make it easier
    // to select one item randomly (weight becomes a cumulative sum;
    // the last entry is pinned to exactly 1.0 to absorb rounding error)
    for (int i = 1; i < num_large_itemsets - 1; i++) {
        LargeItemset prev_large = (LargeItemset)large_itemsets.get(i - 1);
        LargeItemset large = (LargeItemset)large_itemsets.get(i);
        large.weight += prev_large.weight;
    }
    ((LargeItemset)large_itemsets.get(num_large_itemsets - 1)).weight = 1.0;
}

// we use the rand_large and the weights of the large itemsets to
// select a large itemset randomly
//
// Draws a uniform value in [0,1) and binary-searches the cumulative
// weight array built by initLargeItemsets() for the smallest index i
// with weight(i - 1) < val <= weight(i). The chosen index is remembered
// in last_large so ungetRandomLargeItemset() can "push it back".
private LargeItemset nextRandomLargeItemset() {
    // this is a nice trick (courtesy Agrawal) for reusing
    // large itemsets in case they won't be used now
    // (just change the sign of last_large to "push back"
    // the choice - see ungetRandomLargeItemset())
    // NOTE(review): the sign trick cannot encode a pushed-back index 0,
    // since -0 == 0: a push-back of itemset 0 is silently lost and a
    // fresh draw happens instead. Fixing it means re-encoding last_large
    // (e.g. storing -(i + 1)), which also touches code above this chunk —
    // flagged rather than changed here.
    if (last_large < 0) {
        last_large = -last_large;
        return (LargeItemset)large_itemsets.get(last_large);
    }

    double val = rand_large.nextDouble();

    // do a binary search for the location i such that
    // weight(i - 1) < val <= weight(i)
    // ((left + right) / 2 could overflow for huge counts; harmless at
    // realistic pool sizes)
    int i = 0;
    int left = 0;
    int right = num_large_itemsets - 1;
    while (right >= left) {
        int middle = (left + right) / 2;
        LargeItemset large = (LargeItemset)large_itemsets.get(middle);
        if (val < large.weight)
            right = middle - 1;
        else if (val > large.weight)
            left = middle + 1;
        else {
            i = middle;
            break;
        }
    }
    // no exact hit: left now points at the first cumulative weight >= val
    if (right < left)
        i = left;

    // in the case there were neighboring items with probability 0
    // (equal cumulative weights), walk back to the first of the run
    while (i > 0 && ((LargeItemset)large_itemsets.get(i - 1)).weight == val)
        i--;

    // store last index chosed in case the itemset is not used now
    last_large = i;
    return (LargeItemset)large_itemsets.get(last_large);
}

// this method allows us to "push back" a selected large itemset
// such that we use this choice the next time we need a large itemset
// (negating last_large marks it as pushed back; see the NOTE in
// nextRandomLargeItemset() about the index-0 blind spot of this scheme)
private void ungetRandomLargeItemset() {
    if (last_large >= 0)
        last_large = -last_large;
    else
        System.err.println("Invalid call to ungetRandomLargeItemset()!");
}

// we give probabilities to each item, these will be used to
// choose the items to add to a large itemset
//
// Fills item_probabilities with Exponential(mean 1) draws, normalizes
// them to sum to 1, then converts the array in place into a cumulative
// distribution for binary search in nextRandomItem().
private void initItemProbabilities() {
    item_probabilities = new double[num_items];

    // the probabilities are generated with exponential distribution
    // with unit mean
    RandomExponentialDistribution exp = new RandomExponentialDistribution();
    for (int i = 0; i < num_items; i++)
        item_probabilities[i] = exp.nextDouble();

    // now we have to normalize these probabilities such that their
    // sum will total 1. This is done by dividing each probability
    // by their sum
    double sum = 0.0;
    for (int i = 0; i < num_items; i++)
        sum += item_probabilities[i];
    for (int i = 0; i < num_items; i++)
        item_probabilities[i] /= sum;

    // finally we cumulate the probabilities in order to make it easier
    // to select one item randomly (last entry pinned to exactly 1.0 to
    // absorb floating-point rounding)
    for (int i = 1; i < num_items - 1; i++)
        item_probabilities[i] += item_probabilities[i - 1];
    item_probabilities[num_items - 1] = 1.0;
}

// we use the rand_item and the item_probabilities array to
// select an item randomly
//
// Returns a 1-based item id: index i into the cumulative array maps to
// item (i + 1). Mirrors the search in nextRandomLargeItemset().
private int nextRandomItem() {
    double val = rand_item.nextDouble();

    // do a binary search for the location i such that
    // item_probabilities[i - 1] < val <= item_probabilities[i]
    int i = 0;
    int left = 0;
    int right = num_items - 1;
    while (right >= left) {
        int middle = (left + right) / 2;
        if (val < item_probabilities[middle])
            right = middle - 1;
        else if (val > item_probabilities[middle])
            left = middle + 1;
        else {
            i = middle;
            break;
        }
    }
    // no exact hit: left is the first cumulative probability >= val
    if (right < left)
        i = left;

    // in the case there were neighboring items with probability 0,
    // walk back to the first index of the equal-weight run
    while (i > 0 && item_probabilities[i - 1] == val)
        i--;

    return (i + 1);
}

/**
 * sample usage and testing
 *
 * Builds a small generator (constructor arguments per the class's
 * constructor, not visible in this chunk), prints the large-itemset
 * pool, then streams every generated transaction to stdout.
 */
public static void main(String[] args) {
    SyntheticDataGenerator syndatgen = new SyntheticDataGenerator(20, 7, 5, 5, 100);
    System.out.println(syndatgen.getLargeItemsets());
    while (syndatgen.hasMoreTransactions())
        System.out.println(syndatgen.getNextTransaction());
}}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -