⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 apriori.cpp

📁 数据挖掘中频繁模式挖掘的经典算法
💻 CPP
字号:
/*----------------------------------------------------------------------
  File     : Item.cpp
  Contents : itemset management
  Author   : Bart Goethals
  Update   : 4/4/2003
----------------------------------------------------------------------*/

#include "Item.h"

#include <vector>
#include <set>
using namespace std;
#include "Data.h"

#include <algorithm>
#include <iostream>
#include <time.h>

#include "AprioriSets.h"

set<Item> *Item::makeChildren() const
{
	if(children) return children;
	return children = new set<Item>;
}

int Item::deleteChildren() const
{
	int deleted=0;

	if(children)
	{
		for(set<Item>::iterator it = children->begin(); it != children->end(); it++)
		{
			deleted += it->deleteChildren();
		}
		delete children;
		children = 0;
		deleted++;
	}

	return deleted;
}

/*----------------------------------------------------------------------
  File     : Data.cpp
  Contents : data set management
  Author   : Bart Goethals
  Update   : 4/4/2003
  ----------------------------------------------------------------------*/


Transaction::Transaction(const Transaction &tr)
{
  length = tr.length;
  t = new int[tr.length];
  for(int i=0; i< length; i++) t[i] = tr.t[i];
}

Data::Data(char *filename, int t)
{
  fn = filename;
  type = t;
  current=0;

  if(type>1) in = fopen(fn,"rt");
  else in = fopen(fn,"rb");
}

Data::~Data()
{
  if(in) fclose(in);
}

Transaction *Data::getNext()
{
  Transaction *t=0;

  switch(type)
    {
    case 1: t= getNextBin(); break;
    case 2: t= getNextAs(); break;
    case 3: t= getNextAsFlat(); break;
    case 4: t= getNextAsQuest(); break;
    }

  if(t) current++;
  else
    {
      rewind(in);
      current=0;
    }

  return t;
}

Transaction *Data::getNextAs()
{
  Transaction *t;
  int tid, item, i;
  vector<int> list;
  static int cur=0,prev=-1;
  static bool begin=true;

  if(feof(in))
    {
      begin=true;
      prev=-1;
      return 0;
    }

  if(!begin)
    {
      list.push_back(cur);
    }
  else begin=false;

  while(true)
    {
      int dummy;
      fscanf(in, "%d %d %d",&dummy, &tid, &item);
      //fscanf(in, "%d %d", &tid, &item);
      if(feof(in))
	{
	  int size=list.size();
	  t = new Transaction(size);
	  for(i=0; i<size; i++) t->t[i] = list[i];
	  list.clear();

	  return t;
	}
      else if(prev<0)	prev=tid;
      else if(tid != prev)
	{
	  prev = tid;
	  cur = item;
	  int size=list.size();
	  t = new Transaction(size);
	  for(i=0; i<size; i++) t->t[i] = list[i];
	  list.clear();
			
	  return t;
	}

      list.push_back(item);
    }
}

Transaction *Data::getNextAsFlat()
{
  vector<int> list;
  char c;

  // read list of items
  do {
    int item=0, pos=0;
    c = getc(in);
    while((c >= '0') && (c <= '9')) {
      item *=10;
      item += int(c)-int('0');
      c = getc(in);
      pos++;
    }
    if(pos) list.push_back(item);
  }while(c != '\n' && !feof(in));

  if(feof(in))
    return 0;
  // Note, also last transaction must end with newline,
  // else, it will be ignored

  // sort list of items
  // sort(list.begin(),list.end());

  // put items in Transaction structure
  Transaction *t = new Transaction(list.size());
  for(int i=0; i<int(list.size()); i++)
    t->t[i] = list[i];

  return t;
}

Transaction *Data::getNextAsQuest()
{
  int tmptid, tid,l,i;
  Transaction *t;
	
  fscanf(in,"%d %d %d",&tmptid,&tid,&l);
  if(feof(in)) return 0;
	
  t = new Transaction(l);
  for(i=0; i<l; i++)	
    fscanf(in,"%d",&t->t[i]);
  return t;
}

Transaction *Data::getNextBin()
{
  int tmptid, tid,l,i;
  Transaction *t;
	
  fread(&tmptid,4, 1,in);
  if(feof(in)) return 0;
	
  fread(&tid,4, 1,in);
  fread(&l,4, 1,in);
  t = new Transaction(l);
  for(i=0; i<l; i++)	
    {
      fread(&t->t[i],4, 1,in);
    }

  return t;
}





AprioriSets::AprioriSets()
{
  data=0;
  minsup=0;
  remap=0;
  relist=0;
  trie = new Item(0);
  verbose = false;
  countType = 1;
}

AprioriSets::~AprioriSets()
{
  if(data) delete data;
  if(trie) {
    trie->deleteChildren();
    delete trie;
  }
  if(remap) delete remap;
  if(relist) delete relist;
}

void AprioriSets::setData(char *fn, int type)
{
  data = new Data(fn, type);
}

int AprioriSets::setOutputSets(char *fn)
{
  setsout.open(fn);
  if(!setsout.is_open()) {
    cerr << "error: could not open " << fn << endl;
    return -1;
  }
  return 0;
}

int AprioriSets::generateSets()
{
  int total=0, pass=0;
  bool running = true;

  while(running) {
    clock_t start;
    int generated=0, nodes=0, tnr=0, pruned;

    pass++;
    cout << pass << " " << flush;

    if(pass>2) {
      start = clock();
      generated = generateCandidates(pass);
      nodes = pruneNodes(pass);
      if(verbose) cout << generated << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s] " << flush;
    }

    start = clock();
    tnr = countCandidates(pass);
    if(verbose) {
      if(pass == 1) cout << trie->getChildren()->size() << " ";
      cout << tnr << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s] " << flush;
    }

    if(pass==1 && setsout.is_open()) printSet(*trie,0,0);

    start = clock();
    pruned = pruneCandidates(pass);
    if(verbose) cout << pruned << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]\n" << flush;

    if(pass==1) ReOrder(); // Reorder all items

    total += pruned;
    if(pruned <= pass || pass>=iterNums) // modify here
		running = false;
  }

  cout << endl;

  return total;
}

int AprioriSets::generateCandidates(int level)
{
  int *tmp = new int[level];
  int generated = generateCandidates(level, trie->getChildren(), 1, tmp);
  delete [] tmp;

  return generated;
}

int AprioriSets::generateCandidates(int level, set<Item> *items, int depth, int *current)
{
  if(items == 0) return 0;

  int generated = 0;
  set<Item>::reverse_iterator runner;

  if(depth == level-1) {
    for(runner = items->rbegin(); runner != items->rend(); runner++) {
      current[depth-1] = runner->getId();
      set<Item> *children = runner->makeChildren();

      for(set<Item>::reverse_iterator it = items->rbegin(); it != runner; it++) {
	current[depth] = it->getId();
	if(level<=2 || checkSubsets(level,current, trie->getChildren(), 0, 1)) {
	  children->insert(Item(it->getId()));
	  generated++;
	}
      }
    }
  }
  else {
    for(runner = items->rbegin(); runner != items->rend(); runner++) {
      current[depth-1] = runner->getId();
      generated += generateCandidates(level, runner->getChildren(), depth+1, current);
    }
  }
  return generated;
}

bool AprioriSets::checkSubsets(int sl, int *iset, set<Item> *items, int spos, int depth)
{
  if(items==0) return 0;

  bool ok = true;
  set<Item>::iterator runner;
  int loper = spos;
  spos = depth+1;

  while(ok && (--spos >= loper)) {
    runner = items->find(Item(iset[spos]));
    if(runner != items->end()) {
      if(depth<sl-1) ok = checkSubsets(sl, iset, runner->getChildren(), spos+1, depth+1);
    }
    else ok=false;
  }

  return ok;
}

int AprioriSets::pruneNodes(int level)
{
  return pruneNodes(level,trie->getChildren(),1);
}

int AprioriSets::pruneNodes(int level, set<Item> *items, int depth)
{
  if(items == 0) return 0;

  int nodes = 0;

  if(depth==level) nodes = items->size();
  else {
    for(set<Item>::iterator runner = items->begin(); runner != items->end(); ) {
      int now = pruneNodes(level, runner->getChildren(), depth+1);
      if(now) {
	nodes += now;
	nodes++;
	runner++;
      }
      else {
	runner->deleteChildren();
	set<Item>::iterator tmp = runner++;
	items->erase(tmp);
      }
    }
  }

  return nodes;
}

int AprioriSets::countCandidates(int level)
{
  int trans=0;

  // count all single items
  if(level==1) {
    while(Transaction *t = data->getNext()) {
      trie->Increment();

      int *iset = t->t, sl = t->length;
      set<Item> *items = trie->makeChildren();
      for(int i=0; i<sl; i++) {
	Item item(iset[i]);
	set<Item>::iterator runner = items->find(item);
	if(runner == items->end()) runner = (items->insert(item)).first;
	runner->Increment();
      }
      trans++;
      delete t;
    }
  }
  else {
    while(Transaction *t = data->getNext()) {
      if(t->length >= level) {
	// Reorder transaction
	int i;
	vector<int> list;
	for(i=0; i<t->length; i++) {
	  set<Element>::iterator it = relist->find(Element(t->t[i]));
	  if(it != relist->end()) list.push_back(it->id);
	}
	int size=list.size();
	sort(list.begin(), list.end());
	delete t;
	t = new Transaction(size);
	for(i=0; i<size; i++) t->t[i] = list[i];
			
	if(countType==1 || level<=2)
	{
	  if(processTransaction(level, t, trie->getChildren(), 0, 1)) trans++;
	}
	else
	{
	  if(processTransaction2(level, t, trie->getChildren(), 0, 1)) trans++;
	}

	delete t;
      }
    }
  }
  return trans;
}

int AprioriSets::processTransaction2(int level, Transaction *t, set<Item> *items, int spos, int depth)
{
  if(items == 0) return 0;
  int used=0, max = t->length-level+depth;

  for(set<Item>::iterator it = items->begin(); spos<max  && it!=items->end(); it++)
  {
    while(spos<max && t->t[spos] < it->getId()) spos++;
    if(spos<max && (t->t[spos]==it->getId()) )
    {
      if(depth==level)
      {
	it->Increment();
	used++;
      }
      else used += processTransaction2(level,t,it->getChildren(),spos+1,depth+1);
    }
  }

  return used;
}

int AprioriSets::processTransaction(int level, Transaction *t, set<Item> *items, int spos, int depth)
{
  if(items == 0) return 0;
	
  int used=0, *iset = t->t, sl = t->length, loper = spos;
  set<Item>::iterator runner;

  spos = sl-(level-depth);
  while(--spos >= loper) {
    runner = items->find(Item(iset[spos]));
    if(runner != items->end()) {
      if(depth == level) {
	runner->Increment();
	used++;
      }
      else {
	if(depth==1 && level==2) runner->makeChildren();
	used += processTransaction(level, t, runner->getChildren(), spos+1, depth+1);
      }
    }
    else if(depth==2 && level==2) {
      set<Item> *singles = trie->getChildren();
      if(singles->find(Item(iset[spos])) != singles->end()) {
	runner = items->insert(Item(iset[spos])).first;
	runner->Increment();
	used++;
      }
    }
  }

  return used;
}

int AprioriSets::pruneCandidates(int level)
{
  int pruned;
  int *tmp = new int[level];
  pruned = pruneCandidates(level,trie->getChildren(),1,tmp);
  delete [] tmp;
  return pruned;
}

int AprioriSets::pruneCandidates(int level, set<Item> *items, int depth, int *itemset)
{
  if(items == 0) return 0;
  int left = 0;

  for(set<Item>::iterator runner = items->begin(); runner != items->end(); ) {
    itemset[depth-1] = runner->getId();

    if(depth == level) {
      if(runner->getSupport() < minsup) {
	runner->deleteChildren();
	set<Item>::iterator tmp = runner++;
	items->erase(tmp);
      }
      else {
	if(setsout.is_open()) printSet(*runner, itemset, depth);
	left++;
	runner++;
      }
    }
    else {	
      int now = pruneCandidates(level, runner->getChildren(), depth+1, itemset);
      if(now) {
	left += now;
	runner++;
      }
      else {
	runner->deleteChildren();
	set<Item>::iterator tmp = runner++;
	items->erase(tmp);
      }
    }
  }

  return left;
}

void AprioriSets::printSet(const Item& item, int *itemset, int length)
{
  set<int> outset;

  for(int j=0; j<length; j++)
    if(remap) outset.insert(remap[itemset[j]]);
    else outset.insert(itemset[j]);
  for(set<int>::iterator k=outset.begin(); k!=outset.end(); k++) setsout << *k << " ";
  setsout << ":" << item.getSupport() << endl;
}

void AprioriSets::ReOrder()
{
  set<Item> *src = trie->getChildren();
  set<Item>::iterator itI;
  multiset<Element>::iterator itE;
  multiset<Element> list;

  for(itI = src->begin();itI != src->end(); itI++)
    list.insert(Element(itI->getSupport(), itI->getId()));

  remap = new int[list.size()+1];
  relist = new set<Element>;
  src->clear();
  int i=1;
  for(itE=list.begin(); itE!=list.end();itE++) {
    if(itE->oldid >= minsup) {
      remap[i] = itE->id;
      relist->insert(Element(itE->id,i));
      Item a(i);
      a.Increment(itE->oldid);
      src->insert(a);
      i++;
    }
  }
}





int main(int argc, char *argv[])
{
	cout << "Apriori frequent itemset mining implementation" << endl;
	cout << "by Bart Goethals, 2000-2003" << endl;
	cout << "http://www.cs.helsinki.fi/u/goethals/" << endl << endl;

	if(argc < 5) {
		cerr << "usage: " << argv[0] << " datafile datatype minsup maxitemlength [output]" << endl;
		cerr << "datatype = 1 for Quest datagenerator binary" << endl;
		cerr << "datatype = 2 for Quest datagenerator ascii" << endl;
		cerr << "datatype = 3 for flat, i.e. all items per transaction on a single line" << endl;
		cerr << "datatype = 4 for ascii version of Quest datagenerator binary" << endl;
	}
	else {
		AprioriSets a;
		a.setVerbose(); // print information on nr of candidate itemsets etc
		a.setData(argv[1],atoi(argv[2]));

		a.setCountType(2);
		// 1: to check k-subsets of transaction in set of candidates
		// 2: to check all candidates in transaction (default - best performance)

		a.setMinSup(atoi(argv[3]));
		a.setMaxitemlength(atoi(argv[4]));
		if(argc==6) a.setOutputSets(argv[5]);

		clock_t start = clock();
		int sets = a.generateSets();
		cout << sets << "\t[" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]" << endl;
		if(argc==6) cout << "Frequent sets written to " << argv[5] << endl;
	}

	return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -