📄 apriori.cpp
字号:
// Apriori.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <fstream>
#include <iostream>
#include <string>
#include <sstream>
#include <vector>
#include <iterator>
#include <stdlib.h>
#include <cassert>
using namespace std;
class ListNode
{
public:
string itemset;
int count;
ListNode ( string iset, int c = 0 )
{
itemset = iset;
count = c;
}
};
class List
{
public:
vector<ListNode> candidate_set;
vector<ListNode>::iterator iter;
};
int IsSubset(const string & str1 , const string & str2 );// judge wether str1 is the substring of str2
int IsJoinable(const string & str1, const string &str2, string & join);
// same length and first k-1 elements
int stringtovector(string & str, vector<int>& vint );
class Apr
{
public:
List* set_list ;
int current_itemnum;
vector< vector<int> > temporary_array;
Apr(int n = 3){
set_list = new List [ n + 1];
}
~Apr(){
delete[] set_list;
}
void setcurrent (int a)
{
current_itemnum = a;
}
void candidate_generate();
void prune();
int isfrequent( const ListNode & );
void prepare_array();
void clear_array();
};
void Apr ::candidate_generate()
{
assert(current_itemnum > 1 );
int k =current_itemnum - 1;
vector<ListNode>::iterator iter1,iter2;
for(iter1 = set_list[k].candidate_set.begin(); iter1 != set_list[k].candidate_set.end(); iter1++ )
{
iter2 = iter1 + 1;
while ( iter2 != set_list[k].candidate_set.end() )
{
string afterjoin;
if (IsJoinable( (*iter1).itemset, (*iter2).itemset, afterjoin) == 1)
{
set_list[current_itemnum].candidate_set.push_back( ListNode(afterjoin) ); // push back a new item;
}
iter2 ++;
}
}
}
void Apr::prune()
{
// for each k-itemset (k > 2), find if it contains a k-1 itemset that is not frequent;
// prepare the temporary vector
if(current_itemnum == 2) return;
prepare_array();
vector<ListNode>::iterator iter ;
for ( iter = (set_list[current_itemnum].candidate_set).begin(); iter != set_list[current_itemnum].candidate_set.end();)
{
if ( isfrequent( (*iter)) == 0 )
{
set_list[current_itemnum].candidate_set.erase(iter);
}
else
{
iter ++;
}
}
clear_array();
//clear the temporary vector
}
void Apr::prepare_array()
{
int k = current_itemnum - 1;
int array_size = set_list[k].candidate_set.size();
temporary_array.reserve(array_size);
vector<ListNode>::iterator iter;
for( iter = set_list[k].candidate_set.begin(); iter != set_list[k].candidate_set.end(); iter++ )
{
vector<int> temp;
if( stringtovector( (*iter).itemset, temp) == 0)
{
cout << "Error when string to vector ";
}
temporary_array.push_back(temp);
}
}
void Apr::clear_array()
{
int array_size = temporary_array.size();
int i;
for( i = 0 ; i < array_size; i++)
{
temporary_array[i].clear();
}
temporary_array.clear();
}
int Apr::isfrequent(const ListNode & testcurrent )
{
string str = testcurrent.itemset;
vector<int> originalstr;
// vector<int>::iterator iter;
// vector<ListNode>::iterator iter ;
// int k = current_itemnum - 1;
assert( stringtovector ( str,originalstr ) == 1);
/// vector<int> temp0;
// vector<int> temp;
int l ,i;
int array_size = temporary_array.size();
int flag = 0;
for( l = 0; l < current_itemnum - 1; l++ )
{
vector<int> temp0;
temp0 = originalstr;
vector<int> ::iterator iter = temp0.begin();
temp0.erase( iter + l );
// for each subset ,if there is not any match in the frequent k-1 itemset, then return false
flag = 0;
for (i = 0; i < array_size; i ++ )
{
if (temporary_array[i] == temp0 )
{
flag = 1;
break;
}
}
if ( flag == 0 ) return 0;
}
return 1;
}
const int itemnum = 1000;
const int n = 4;
int main(int argc, char* argv[])
{
ifstream in ;
unsigned int frequency[itemnum + 1];
unsigned int i;
unsigned int item_id;
Apr apr(n) ;
string transaction ;
// read the dataset of transaction
in.open("dataset.txt",ios::in);
for ( i = 1; i <= itemnum; i++)
{
frequency[i] = 0;
}
string str,strtemp;
// get the frequency of each item
while ( getline (in,str) )
{
istringstream sstr ;
sstr.str(str);
while ( sstr >> item_id )
{
cout << item_id << " ";
frequency[item_id] ++;
}
cout << endl;
}
// in.close();
// find the set of 1-item frequent ;
int support[n + 1] ; // support number for each itemset number
// string temp;
// ****************************************************************
//***************** assign the value for support array*************
for (i = 0 ; i <= n ; i++ )
{
support[i] = 2;
}
support[1] = 2;
//**********************************************************************
for(i = 1; i <= itemnum; i++)
{
if (frequency[i] >= support[1] )
{
string temp ;
char c2[2] ;
c2[0] = '0' + i;
c2[1] = '\0';
temp = c2;
// temp << i
apr.set_list[1].candidate_set.push_back(ListNode(temp,frequency[i]));
}
}
vector<ListNode> :: const_iterator citer ;
for(citer = apr.set_list[1].candidate_set.begin(); citer != apr.set_list[1].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
}
/* string s1 = "1";
string s2 = "4 5 ";
string r;
if(IsJoinable(s1,s2,r) == 0)
{
cout << " cannot be joined" << endl;
}
else
{
cout << endl << r.c_str();
}
*/
//***************************************************
// set current item number
//
//*******************************************
// generate frequent itemset in loop
//********************************************
int current_num = 2;
while (current_num <= n){
apr.setcurrent(current_num);
// generate candidate set
apr.candidate_generate();
cout << "after generate" << current_num <<"-itemset"<< endl;
for(citer = apr.set_list[current_num].candidate_set.begin(); citer != apr.set_list[current_num].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
}
/* for(citer = apr.set_list[2].candidate_set.begin(); citer != apr.set_list[2].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
} */
cout << endl << endl ;
// prune and generate the final candidate set
apr.prune();
cout << "after prune" << current_num <<"-itemset"<< endl;
for(citer = apr.set_list[current_num].candidate_set.begin(); citer != apr.set_list[current_num].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
}
//scan the database and delete the candidate that is not frequent in CANDIDATE SET WHICH IS THE DATA MEMBER IN THE APR
// in. open("dataset.txt",ios::in);
in.clear();
in.seekg(0,ios::beg);
while ( getline(in,str) )
{
vector<ListNode>::iterator iter;
for ( iter= apr.set_list[apr.current_itemnum].candidate_set.begin();
iter != apr.set_list[apr.current_itemnum].candidate_set.end(); iter ++)
{
if ( IsSubset((*iter).itemset ,str) )
{
(*iter).count ++;
}
}
}
cout << endl << "after scan the database:" << endl;
for(citer = apr.set_list[current_num].candidate_set.begin(); citer != apr.set_list[current_num].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
}
cout << endl;
vector<ListNode>::iterator iter ;
// delete the elements with the support number
for(iter = apr.set_list[apr.current_itemnum].candidate_set.begin();
iter != apr.set_list[apr.current_itemnum].candidate_set.end();)
{
if((*iter).count < support[apr.current_itemnum])
{
////////////////////////////////////////
//cout<< (*iter).itemset.c_str();
apr.set_list[apr.current_itemnum].candidate_set.erase(iter);
}
else
{
iter++;
}
// if(iter == apr.set_list[apr.current_itemnum].candidate_set.end())
// break;
}
cout << "the final frequent " << current_num <<" itemset" << endl;
for(citer = apr.set_list[current_num].candidate_set.begin(); citer != apr.set_list[current_num].candidate_set.end();citer++ )
{
cout << (*citer).itemset << ":" << (*citer).count << endl;
}
current_num ++;
}
//*****************************************
//*********the end of the loop************
//*****************************************
in.close();
return 0;
} // end of main function
// see if str1 is the substring of string str2
// and presume that all the elements in the string is in increasing order
int IsSubset( const string & str1,const string & str2)
{
static const int npos = -1;
if (str1.length() == 0)
{
return 1; // as the empty set is the subset of any set
}
if(str2. length() == 0 && str1.length() != 0 )
{
return 0;
}
vector<int> v1, v2;
int a;
// push the string into the vector v1 one integer by another
int start = str1.find_first_not_of(' ');
if (start == npos)
{
return 0;
}
int end = str1. find_first_of(' ', start);
while ( end != npos && start != npos )
{
a = atoi( ( str1.substr(start, end - start) ).c_str() );
v1.push_back(a);
start = str1.find_first_not_of(' ', end);
end = str1. find_first_of(' ', start);
}
if (end == npos && start != npos ) // insert the last item that has been detected
{
a = atoi( ( str1.substr(start, str1.length() - start) ).c_str() );
v1.push_back(a);
}
// push the string into the vector v2 one integer by another
start = str2.find_first_not_of(' ');
if (start == npos)
{
return 0;
}
end = str2. find_first_of(' ', start);
while ( end != npos && start != npos )
{
a = atoi( ( str2.substr(start, end - start) ).c_str() );
v2.push_back(a);
start = str2.find_first_not_of(' ', end);
end = str2. find_first_of(' ', start);
}
if (end == npos && start != npos )
{
a = atoi( ( str2.substr(start, str1.length() - start) ).c_str() );
v2.push_back(a);
}
int k1 = v1.size();
int k2 = v2.size();
int i1 = 0;
int i2 = 0;
while (i1 < k1 && i2 < k2)
{
if (v1[i1] == v2[i2])
{
i1++;
}
i2++;
}
if (i1 < k1)
{
return 0;
}
else
{
return 1;
}
/* int start1 = str1.find_first_not_of(' ');
int end1 = str1. find_first_of(' ', start1);
int start2 = str2.find_first_not_of(' ');
int end2 = str2. find_first_of(' ', start2);
int a ,b;
a = atoi( ( str1.substr(start1, end1 - start1) ).c_str() );
b = atoi( ( str2.substr(start2, end2 - start2) ).c_str() );
while(start1 != npos && start2 != npos)
{
while ( a != b)
{
int start2 = str2.find_first_not_of(' ', end2);
int end2 = str2. find_first_of(' ', start2);
b = atoi( ( str2.substr(start2, end2 - start2) ).c_str() );
}
int start1 = str2.find_first_not_of(' ', end1);
int end1 = str2. find_first_of(' ', start1);
a = atoi( ( str2.substr(start1, end1 - start1) ).c_str() );
int start2 = str2.find_first_not_of(' ', end2);
int end2 = str2. find_first_of(' ', start2);
b = atoi( ( str2.substr(start2, end2 - start2) ).c_str() );
}
if ( start1 != npos && start2 == npos )
return 0;
else return 1; */
}
int IsJoinable(const string & str1, const string &str2, string & join)
// same length and first k-1 elements
{
// change the two string into two int vector;
static const int npos = -1;
vector<int> v1, v2;
int a;
// push the string into the vector v1 one integer by another
int start = str1.find_first_not_of(' ');
if (start == npos)
{
return 0;
}
int end = str1. find_first_of(' ', start);
while ( end != npos && start != npos )
{
a = atoi( ( str1.substr(start, end - start) ).c_str() );
v1.push_back(a);
start = str1.find_first_not_of(' ', end);
end = str1. find_first_of(' ', start);
}
if (end == npos && start != npos ) // insert the last item that has been detected
{
a = atoi( ( str1.substr(start, str1.length() - start) ).c_str() );
v1.push_back(a);
}
// push the string into the vector v2 one integer by another
start = str2.find_first_not_of(' ');
if (start == npos)
{
return 0;
}
end = str2. find_first_of(' ', start);
while ( end != npos && start != npos )
{
a = atoi( ( str2.substr(start, end - start) ).c_str() );
v2.push_back(a);
start = str2.find_first_not_of(' ', end);
end = str2. find_first_of(' ', start);
}
if (end == npos && start != npos )
{
a = atoi( ( str2.substr(start, str1.length() - start) ).c_str() );
v2.push_back(a);
}
// judge if the two vector can be joined
int k = v1.size();
int i = 0;
if (v1.size() != v2.size() )
{
return 0;
}
// vector<int> iter1,iter2;
if ( k != 1 )
{
while ( i < k-1 )
{
if (v1[i] != v2[i])
{
return 0;
}
i++;
}
}
if(v1[i] < v2[i] )
{
v1.push_back(v2[i]);
}
else
{
v1.push_back(v1[i]);
v1[i] = v2[i];
}
for( i = 0; i < v1.size(); i++)
{
char ptr[80];
_itoa(v1[i],ptr,10);
join += ptr;
join += " ";
}
join = join.erase(join.size() -1 ,1);
return 1;
}
int stringtovector (string & str, vector<int> & vint )
{
static const int npos = -1;
int a;
// push the string into the vector v1 one integer by another
int start = str.find_first_not_of(' ');
if (start == npos)
{
return 0;
}
int end = str. find_first_of(' ', start);
while ( end != npos && start != npos )
{
a = atoi( ( str.substr(start, end - start) ).c_str() );
vint.push_back(a);
start = str.find_first_not_of(' ', end);
end = str. find_first_of(' ', start);
}
if (end == npos && start != npos ) // insert the last item that has been detected
{
a = atoi( ( str.substr(start, str.length() - start) ).c_str() );
vint.push_back(a);
}
return 1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -