📄 apriori.cpp
字号:
/*----------------------------------------------------------------------
File : Item.cpp
Contents : itemset management
Author : Bart Goethals
Update : 4/4/2003
----------------------------------------------------------------------*/
#include "Item.h"
#include <vector>
#include <set>
using namespace std;
#include "Data.h"
#include <algorithm>
#include <iostream>
#include <time.h>
#include "AprioriSets.h"
set<Item> *Item::makeChildren() const
{
if(children) return children;
return children = new set<Item>;
}
int Item::deleteChildren() const
{
int deleted=0;
if(children)
{
for(set<Item>::iterator it = children->begin(); it != children->end(); it++)
{
deleted += it->deleteChildren();
}
delete children;
children = 0;
deleted++;
}
return deleted;
}
/*----------------------------------------------------------------------
File : Data.cpp
Contents : data set management
Author : Bart Goethals
Update : 4/4/2003
----------------------------------------------------------------------*/
Transaction::Transaction(const Transaction &tr)
{
length = tr.length;
t = new int[tr.length];
for(int i=0; i< length; i++) t[i] = tr.t[i];
}
Data::Data(char *filename, int t)
{
fn = filename;
type = t;
current=0;
if(type>1) in = fopen(fn,"rt");
else in = fopen(fn,"rb");
}
Data::~Data()
{
if(in) fclose(in);
}
Transaction *Data::getNext()
{
Transaction *t=0;
switch(type)
{
case 1: t= getNextBin(); break;
case 2: t= getNextAs(); break;
case 3: t= getNextAsFlat(); break;
case 4: t= getNextAsQuest(); break;
}
if(t) current++;
else
{
rewind(in);
current=0;
}
return t;
}
Transaction *Data::getNextAs()
{
Transaction *t;
int tid, item, i;
vector<int> list;
static int cur=0,prev=-1;
static bool begin=true;
if(feof(in))
{
begin=true;
prev=-1;
return 0;
}
if(!begin)
{
list.push_back(cur);
}
else begin=false;
while(true)
{
int dummy;
fscanf(in, "%d %d %d",&dummy, &tid, &item);
//fscanf(in, "%d %d", &tid, &item);
if(feof(in))
{
int size=list.size();
t = new Transaction(size);
for(i=0; i<size; i++) t->t[i] = list[i];
list.clear();
return t;
}
else if(prev<0) prev=tid;
else if(tid != prev)
{
prev = tid;
cur = item;
int size=list.size();
t = new Transaction(size);
for(i=0; i<size; i++) t->t[i] = list[i];
list.clear();
return t;
}
list.push_back(item);
}
}
Transaction *Data::getNextAsFlat()
{
vector<int> list;
char c;
// read list of items
do {
int item=0, pos=0;
c = getc(in);
while((c >= '0') && (c <= '9')) {
item *=10;
item += int(c)-int('0');
c = getc(in);
pos++;
}
if(pos) list.push_back(item);
}while(c != '\n' && !feof(in));
if(feof(in))
return 0;
// Note, also last transaction must end with newline,
// else, it will be ignored
// sort list of items
// sort(list.begin(),list.end());
// put items in Transaction structure
Transaction *t = new Transaction(list.size());
for(int i=0; i<int(list.size()); i++)
t->t[i] = list[i];
return t;
}
Transaction *Data::getNextAsQuest()
{
int tmptid, tid,l,i;
Transaction *t;
fscanf(in,"%d %d %d",&tmptid,&tid,&l);
if(feof(in)) return 0;
t = new Transaction(l);
for(i=0; i<l; i++)
fscanf(in,"%d",&t->t[i]);
return t;
}
Transaction *Data::getNextBin()
{
int tmptid, tid,l,i;
Transaction *t;
fread(&tmptid,4, 1,in);
if(feof(in)) return 0;
fread(&tid,4, 1,in);
fread(&l,4, 1,in);
t = new Transaction(l);
for(i=0; i<l; i++)
{
fread(&t->t[i],4, 1,in);
}
return t;
}
AprioriSets::AprioriSets()
{
data=0;
minsup=0;
remap=0;
relist=0;
trie = new Item(0);
verbose = false;
countType = 1;
}
AprioriSets::~AprioriSets()
{
if(data) delete data;
if(trie) {
trie->deleteChildren();
delete trie;
}
if(remap) delete remap;
if(relist) delete relist;
}
void AprioriSets::setData(char *fn, int type)
{
data = new Data(fn, type);
}
int AprioriSets::setOutputSets(char *fn)
{
setsout.open(fn);
if(!setsout.is_open()) {
cerr << "error: could not open " << fn << endl;
return -1;
}
return 0;
}
int AprioriSets::generateSets()
{
int total=0, pass=0;
bool running = true;
while(running) {
clock_t start;
int generated=0, nodes=0, tnr=0, pruned;
pass++;
cout << pass << " " << flush;
if(pass>2) {
start = clock();
generated = generateCandidates(pass);
nodes = pruneNodes(pass);
if(verbose) cout << generated << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s] " << flush;
}
start = clock();
tnr = countCandidates(pass);
if(verbose) {
if(pass == 1) cout << trie->getChildren()->size() << " ";
cout << tnr << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s] " << flush;
}
if(pass==1 && setsout.is_open()) printSet(*trie,0,0);
start = clock();
pruned = pruneCandidates(pass);
if(verbose) cout << pruned << " [" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]\n" << flush;
if(pass==1) ReOrder(); // Reorder all items
total += pruned;
if(pruned <= pass || pass>=iterNums) // modify here
running = false;
}
cout << endl;
return total;
}
int AprioriSets::generateCandidates(int level)
{
int *tmp = new int[level];
int generated = generateCandidates(level, trie->getChildren(), 1, tmp);
delete [] tmp;
return generated;
}
int AprioriSets::generateCandidates(int level, set<Item> *items, int depth, int *current)
{
if(items == 0) return 0;
int generated = 0;
set<Item>::reverse_iterator runner;
if(depth == level-1) {
for(runner = items->rbegin(); runner != items->rend(); runner++) {
current[depth-1] = runner->getId();
set<Item> *children = runner->makeChildren();
for(set<Item>::reverse_iterator it = items->rbegin(); it != runner; it++) {
current[depth] = it->getId();
if(level<=2 || checkSubsets(level,current, trie->getChildren(), 0, 1)) {
children->insert(Item(it->getId()));
generated++;
}
}
}
}
else {
for(runner = items->rbegin(); runner != items->rend(); runner++) {
current[depth-1] = runner->getId();
generated += generateCandidates(level, runner->getChildren(), depth+1, current);
}
}
return generated;
}
bool AprioriSets::checkSubsets(int sl, int *iset, set<Item> *items, int spos, int depth)
{
if(items==0) return 0;
bool ok = true;
set<Item>::iterator runner;
int loper = spos;
spos = depth+1;
while(ok && (--spos >= loper)) {
runner = items->find(Item(iset[spos]));
if(runner != items->end()) {
if(depth<sl-1) ok = checkSubsets(sl, iset, runner->getChildren(), spos+1, depth+1);
}
else ok=false;
}
return ok;
}
int AprioriSets::pruneNodes(int level)
{
return pruneNodes(level,trie->getChildren(),1);
}
int AprioriSets::pruneNodes(int level, set<Item> *items, int depth)
{
if(items == 0) return 0;
int nodes = 0;
if(depth==level) nodes = items->size();
else {
for(set<Item>::iterator runner = items->begin(); runner != items->end(); ) {
int now = pruneNodes(level, runner->getChildren(), depth+1);
if(now) {
nodes += now;
nodes++;
runner++;
}
else {
runner->deleteChildren();
set<Item>::iterator tmp = runner++;
items->erase(tmp);
}
}
}
return nodes;
}
int AprioriSets::countCandidates(int level)
{
int trans=0;
// count all single items
if(level==1) {
while(Transaction *t = data->getNext()) {
trie->Increment();
int *iset = t->t, sl = t->length;
set<Item> *items = trie->makeChildren();
for(int i=0; i<sl; i++) {
Item item(iset[i]);
set<Item>::iterator runner = items->find(item);
if(runner == items->end()) runner = (items->insert(item)).first;
runner->Increment();
}
trans++;
delete t;
}
}
else {
while(Transaction *t = data->getNext()) {
if(t->length >= level) {
// Reorder transaction
int i;
vector<int> list;
for(i=0; i<t->length; i++) {
set<Element>::iterator it = relist->find(Element(t->t[i]));
if(it != relist->end()) list.push_back(it->id);
}
int size=list.size();
sort(list.begin(), list.end());
delete t;
t = new Transaction(size);
for(i=0; i<size; i++) t->t[i] = list[i];
if(countType==1 || level<=2)
{
if(processTransaction(level, t, trie->getChildren(), 0, 1)) trans++;
}
else
{
if(processTransaction2(level, t, trie->getChildren(), 0, 1)) trans++;
}
delete t;
}
}
}
return trans;
}
int AprioriSets::processTransaction2(int level, Transaction *t, set<Item> *items, int spos, int depth)
{
if(items == 0) return 0;
int used=0, max = t->length-level+depth;
for(set<Item>::iterator it = items->begin(); spos<max && it!=items->end(); it++)
{
while(spos<max && t->t[spos] < it->getId()) spos++;
if(spos<max && (t->t[spos]==it->getId()) )
{
if(depth==level)
{
it->Increment();
used++;
}
else used += processTransaction2(level,t,it->getChildren(),spos+1,depth+1);
}
}
return used;
}
int AprioriSets::processTransaction(int level, Transaction *t, set<Item> *items, int spos, int depth)
{
if(items == 0) return 0;
int used=0, *iset = t->t, sl = t->length, loper = spos;
set<Item>::iterator runner;
spos = sl-(level-depth);
while(--spos >= loper) {
runner = items->find(Item(iset[spos]));
if(runner != items->end()) {
if(depth == level) {
runner->Increment();
used++;
}
else {
if(depth==1 && level==2) runner->makeChildren();
used += processTransaction(level, t, runner->getChildren(), spos+1, depth+1);
}
}
else if(depth==2 && level==2) {
set<Item> *singles = trie->getChildren();
if(singles->find(Item(iset[spos])) != singles->end()) {
runner = items->insert(Item(iset[spos])).first;
runner->Increment();
used++;
}
}
}
return used;
}
int AprioriSets::pruneCandidates(int level)
{
int pruned;
int *tmp = new int[level];
pruned = pruneCandidates(level,trie->getChildren(),1,tmp);
delete [] tmp;
return pruned;
}
int AprioriSets::pruneCandidates(int level, set<Item> *items, int depth, int *itemset)
{
if(items == 0) return 0;
int left = 0;
for(set<Item>::iterator runner = items->begin(); runner != items->end(); ) {
itemset[depth-1] = runner->getId();
if(depth == level) {
if(runner->getSupport() < minsup) {
runner->deleteChildren();
set<Item>::iterator tmp = runner++;
items->erase(tmp);
}
else {
if(setsout.is_open()) printSet(*runner, itemset, depth);
left++;
runner++;
}
}
else {
int now = pruneCandidates(level, runner->getChildren(), depth+1, itemset);
if(now) {
left += now;
runner++;
}
else {
runner->deleteChildren();
set<Item>::iterator tmp = runner++;
items->erase(tmp);
}
}
}
return left;
}
void AprioriSets::printSet(const Item& item, int *itemset, int length)
{
set<int> outset;
for(int j=0; j<length; j++)
if(remap) outset.insert(remap[itemset[j]]);
else outset.insert(itemset[j]);
for(set<int>::iterator k=outset.begin(); k!=outset.end(); k++) setsout << *k << " ";
setsout << ":" << item.getSupport() << endl;
}
void AprioriSets::ReOrder()
{
set<Item> *src = trie->getChildren();
set<Item>::iterator itI;
multiset<Element>::iterator itE;
multiset<Element> list;
for(itI = src->begin();itI != src->end(); itI++)
list.insert(Element(itI->getSupport(), itI->getId()));
remap = new int[list.size()+1];
relist = new set<Element>;
src->clear();
int i=1;
for(itE=list.begin(); itE!=list.end();itE++) {
if(itE->oldid >= minsup) {
remap[i] = itE->id;
relist->insert(Element(itE->id,i));
Item a(i);
a.Increment(itE->oldid);
src->insert(a);
i++;
}
}
}
int main(int argc, char *argv[])
{
cout << "Apriori frequent itemset mining implementation" << endl;
cout << "by Bart Goethals, 2000-2003" << endl;
cout << "http://www.cs.helsinki.fi/u/goethals/" << endl << endl;
if(argc < 5) {
cerr << "usage: " << argv[0] << " datafile datatype minsup maxitemlength [output]" << endl;
cerr << "datatype = 1 for Quest datagenerator binary" << endl;
cerr << "datatype = 2 for Quest datagenerator ascii" << endl;
cerr << "datatype = 3 for flat, i.e. all items per transaction on a single line" << endl;
cerr << "datatype = 4 for ascii version of Quest datagenerator binary" << endl;
}
else {
AprioriSets a;
a.setVerbose(); // print information on nr of candidate itemsets etc
a.setData(argv[1],atoi(argv[2]));
a.setCountType(2);
// 1: to check k-subsets of transaction in set of candidates
// 2: to check all candidates in transaction (default - best performance)
a.setMinSup(atoi(argv[3]));
a.setMaxitemlength(atoi(argv[4]));
if(argc==6) a.setOutputSets(argv[5]);
clock_t start = clock();
int sets = a.generateSets();
cout << sets << "\t[" << (clock()-start)/double(CLOCKS_PER_SEC) << "s]" << endl;
if(argc==6) cout << "Frequent sets written to " << argv[5] << endl;
}
return 0;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -