📄 syntheticdatagenerator.java
字号:
// NOTE(review): tail of a method whose beginning lies above this chunk
// (presumably getNextTransaction()): bumps the running transaction counter
// and returns the transaction just built. Left byte-identical.
current_transaction++;
return transaction; }

/**
 * Return the large itemsets used in the generation of transactions.
 * This can be useful for debugging.
 *
 * @return a Vector containing the large itemsets as Itemset objects.
 */
public Vector getLargeItemsets() {
    Vector large = new Vector();
    // Expose only the Itemset payload of each pool entry, not the
    // weight/corruption bookkeeping carried by LargeItemset.
    for (int i = 0; i < large_itemsets.size(); i++)
        large.add(((LargeItemset)large_itemsets.get(i)).is);
    return large;
}

// this method creates a random pool of large itemsets that will
// be used in the generation of transactions
//
// Pool construction, per itemset:
//   - size drawn from Poisson(avg_large_itemset_size - 1) + 1, capped at num_items;
//   - a fraction of items is copied from the PREVIOUS itemset (exponential
//     draw with mean correlation_mean) so consecutive itemsets correlate;
//   - remaining slots filled with items drawn via nextRandomItem();
//   - weight ~ Exponential(mean 1), corruption ~ Normal(corruption_mean, sd 0.1).
// Weights are then normalized to sum to 1 and converted in place into a
// cumulative distribution so nextRandomLargeItemset() can binary-search it.
// NOTE(review): this mirrors the IBM Quest generator of Agrawal & Srikant —
// presumably intentional; confirm against the paper if behavior matters.
private void initLargeItemsets() {
    // used for selecting a random item
    rand_item = new Random();

    // assign probabilities to items
    initItemProbabilities();

    large_itemsets = new Vector(num_large_itemsets);
    RandomPoissonDistribution poisson_large_size
        = new RandomPoissonDistribution(avg_large_itemset_size - 1);
    RandomExponentialDistribution exp_correlation
        = new RandomExponentialDistribution(correlation_mean);
    RandomExponentialDistribution exp_weight
        = new RandomExponentialDistribution();
    Random normal_corruption = new Random();

    for (int i = 0; i < num_large_itemsets; i++) {
        // the large itemset size is obtained from a Poisson distribution
        // with mean avg_large_itemset_size (mean-1 draw, then +1, so size >= 1)
        int large_itemset_size = (int)poisson_large_size.nextLong() + 1;
        if (large_itemset_size > num_items)
            large_itemset_size = num_items;

        LargeItemset large = new LargeItemset();
        large.is = new Itemset(large_itemset_size);

        if (i > 0) {
            // get previous large itemset
            LargeItemset prev_large = (LargeItemset)large_itemsets.get(i - 1);

            // we determine the fraction of items to use from the
            // previous itemset using an exponential distribution
            // with mean equal to correlation_mean
            // (we add 0.5 to round off)
            int fraction = (int)(((double)large_itemset_size)
                                 * exp_correlation.nextDouble() + 0.5);

            // make adjustments if necessary: cannot copy more items than
            // this itemset holds, nor more than the previous itemset has
            if (fraction > large_itemset_size)
                fraction = large_itemset_size;
            if (fraction > prev_large.is.size())
                fraction = prev_large.is.size();

            // select randomly the fraction of items from the previous
            // large itemset; RandomSample returns 1-based positions,
            // hence the "- 1" when indexing into the previous itemset
            if (fraction > 0) {
                RandomSample rand_sample
                    = new RandomSample(prev_large.is.size(), fraction, rand_sampling);
                long[] sample = rand_sample.nextSample();
                for (int j = 0; j < sample.length; j++)
                    large.is.addItem(prev_large.is.getItem((int)sample[j] - 1));
            }
        }

        // add items randomly until we fill the itemset
        // NOTE(review): assumes Itemset.addItem() ignores duplicates or that
        // size() counts distinct items; otherwise this loop could stall or
        // over-count — cannot tell from here, confirm against Itemset.
        while (large.is.size() < large_itemset_size)
            large.is.addItem(nextRandomItem());

        // we associate to this itemset a weight picked from
        // an exponential distribution with unit mean
        large.weight = exp_weight.nextDouble();

        // we also assign a corruption level obtained from a
        // normal distribution with mean corruption_mean and variance 0.1
        // (nextGaussian() * 0.1 scales the STANDARD DEVIATION to 0.1,
        // i.e. variance 0.01 — the original comment overstates it)
        large.corruption = normal_corruption.nextGaussian() * 0.1 + corruption_mean;

        large_itemsets.add(large);
    }

    // now we have to normalize the weights of the large itemsets
    // such that their sum will total 1. This is done by dividing
    // each weight by their sum
    double sum = 0.0;
    for (int i = 0; i < num_large_itemsets; i++)
        sum += ((LargeItemset)large_itemsets.get(i)).weight;
    for (int i = 0; i < num_large_itemsets; i++)
        ((LargeItemset)large_itemsets.get(i)).weight /= sum;

    // finally we cumulate the probabilities in order to make it easier
    // to select one item randomly (weight becomes a cumulative sum;
    // the last entry is pinned to exactly 1.0 to absorb rounding error)
    for (int i = 1; i < num_large_itemsets - 1; i++) {
        LargeItemset prev_large = (LargeItemset)large_itemsets.get(i - 1);
        LargeItemset large = (LargeItemset)large_itemsets.get(i);
        large.weight += prev_large.weight;
    }
    ((LargeItemset)large_itemsets.get(num_large_itemsets - 1)).weight = 1.0;
}

// we use the rand_large and the weights of the large itemsets to
// select a large itemset randomly
//
// Draws a uniform value in [0,1) and binary-searches the cumulative
// weight array built by initLargeItemsets() for the smallest index i
// with weight(i - 1) < val <= weight(i). The chosen index is remembered
// in last_large so ungetRandomLargeItemset() can "push it back".
private LargeItemset nextRandomLargeItemset() {
    // this is a nice trick (courtesy Agrawal) for reusing
    // large itemsets in case they won't be used now
    // (just change the sign of last_large to "push back"
    // the choice - see ungetRandomLargeItemset())
    // NOTE(review): the sign trick cannot encode a pushed-back index 0,
    // since -0 == 0: a push-back of itemset 0 is silently lost and a
    // fresh draw happens instead. Fixing it means re-encoding last_large
    // (e.g. storing -(i + 1)), which also touches code above this chunk —
    // flagged rather than changed here.
    if (last_large < 0) {
        last_large = -last_large;
        return (LargeItemset)large_itemsets.get(last_large);
    }

    double val = rand_large.nextDouble();

    // do a binary search for the location i such that
    // weight(i - 1) < val <= weight(i)
    // ((left + right) / 2 could overflow for huge counts; harmless at
    // realistic pool sizes)
    int i = 0;
    int left = 0;
    int right = num_large_itemsets - 1;
    while (right >= left) {
        int middle = (left + right) / 2;
        LargeItemset large = (LargeItemset)large_itemsets.get(middle);
        if (val < large.weight)
            right = middle - 1;
        else if (val > large.weight)
            left = middle + 1;
        else {
            i = middle;
            break;
        }
    }
    // no exact hit: left now points at the first cumulative weight >= val
    if (right < left)
        i = left;

    // in the case there were neighboring items with probability 0
    // (equal cumulative weights), walk back to the first of the run
    while (i > 0 && ((LargeItemset)large_itemsets.get(i - 1)).weight == val)
        i--;

    // store last index chosed in case the itemset is not used now
    last_large = i;
    return (LargeItemset)large_itemsets.get(last_large);
}

// this method allows us to "push back" a selected large itemset
// such that we use this choice the next time we need a large itemset
// (negating last_large marks it as pushed back; see the NOTE in
// nextRandomLargeItemset() about the index-0 blind spot of this scheme)
private void ungetRandomLargeItemset() {
    if (last_large >= 0)
        last_large = -last_large;
    else
        System.err.println("Invalid call to ungetRandomLargeItemset()!");
}

// we give probabilities to each item, these will be used to
// choose the items to add to a large itemset
//
// Fills item_probabilities with Exponential(mean 1) draws, normalizes
// them to sum to 1, then converts the array in place into a cumulative
// distribution for binary search in nextRandomItem().
private void initItemProbabilities() {
    item_probabilities = new double[num_items];

    // the probabilities are generated with exponential distribution
    // with unit mean
    RandomExponentialDistribution exp = new RandomExponentialDistribution();
    for (int i = 0; i < num_items; i++)
        item_probabilities[i] = exp.nextDouble();

    // now we have to normalize these probabilities such that their
    // sum will total 1. This is done by dividing each probability
    // by their sum
    double sum = 0.0;
    for (int i = 0; i < num_items; i++)
        sum += item_probabilities[i];
    for (int i = 0; i < num_items; i++)
        item_probabilities[i] /= sum;

    // finally we cumulate the probabilities in order to make it easier
    // to select one item randomly (last entry pinned to exactly 1.0 to
    // absorb floating-point rounding)
    for (int i = 1; i < num_items - 1; i++)
        item_probabilities[i] += item_probabilities[i - 1];
    item_probabilities[num_items - 1] = 1.0;
}

// we use the rand_item and the item_probabilities array to
// select an item randomly
//
// Returns a 1-based item id: index i into the cumulative array maps to
// item (i + 1). Mirrors the search in nextRandomLargeItemset().
private int nextRandomItem() {
    double val = rand_item.nextDouble();

    // do a binary search for the location i such that
    // item_probabilities[i - 1] < val <= item_probabilities[i]
    int i = 0;
    int left = 0;
    int right = num_items - 1;
    while (right >= left) {
        int middle = (left + right) / 2;
        if (val < item_probabilities[middle])
            right = middle - 1;
        else if (val > item_probabilities[middle])
            left = middle + 1;
        else {
            i = middle;
            break;
        }
    }
    // no exact hit: left is the first cumulative probability >= val
    if (right < left)
        i = left;

    // in the case there were neighboring items with probability 0,
    // walk back to the first index of the equal-weight run
    while (i > 0 && item_probabilities[i - 1] == val)
        i--;

    return (i + 1);
}

/**
 * sample usage and testing
 *
 * Builds a small generator (constructor arguments per the class's
 * constructor, not visible in this chunk), prints the large-itemset
 * pool, then streams every generated transaction to stdout.
 */
public static void main(String[] args) {
    SyntheticDataGenerator syndatgen = new SyntheticDataGenerator(20, 7, 5, 5, 100);
    System.out.println(syndatgen.getLargeItemsets());
    while (syndatgen.hasMoreTransactions())
        System.out.println(syndatgen.getNextTransaction());
}}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -