📄 apriori.cpp
字号:
// Apriori.cpp: implementation of the CApriori class.
//输入一个实例list,里面的每个itemSet是个transaction,找到其中所有的频繁项集并 存放在 m_Ls中,返回
//////////////////////////////////////////////////////////////////////
#include "Apriori.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
//---------------------------------------------------------------------------
// Class Apriori
// Method that finds all large itemsets for the given set of instances.
void CApriori::FindLargeItemSets(list *instances)
{
list *kMinusOneSets, *KSets;
itemSet *current;
int i = 0, j;
CHashTree *tree;
m_samples = instances; //实例list
m_sampleNum = (long)m_samples->size();//数据集中项集个数
necSupport = (long)(m_minSupport*(double)m_sampleNum + 0.5); //计算要达到m_minSupport需要一项至少出现的次数,四舍五入
KSets = singletons(); //找到1频繁项集
if (KSets->size() == 0)
return;
if(m_samples->size() <= 0)
return;
// Find k>=2 large itemsets
do
{
m_Ls->add(KSets); //m_ls里存放所有的I(I从1到K)频繁项集
kMinusOneSets = KSets; //kMinusOneSets做中间过渡作用?
KSets = selfjoin(kMinusOneSets, i); //如果这里要找K候选频繁集,i总是为K-2
delete kMinusOneSets;
tree = (CHashTree *)new CHashTree();
for(j = 0; j < KSets->size(); j++)
{
current = (itemSet *)KSets->get(j);
tree->insert(&(tree->root), (itemSet *)current->clone(), 0); //放入hash tree
// delete current;
}
delete KSets;
j= 0;
while(j < m_samples->size()) //对每个事务进行K匹配
{
current = (itemSet *)m_samples->get(j);
if(current->size() < i+2)
{
m_samples->remove(j); //无效事务
}
else
{
tree->subset(tree->root, current, 0); //这里有支持度的计数,从第0个位置开始匹配
j++;
}
// delete current;
}
KSets = (list *)new list();
tree->scan(tree->root, KSets, necSupport); ////找到给定树中支持度大于等于necSupport的所有itemSet,并放到KSets中,利用递归的方法
delete tree;
i++;
} while (KSets->size() > 0);
return;
}
// find 1 itemset
list * CApriori::singletons()
{
list *setOfItemSets = (list *)new list();
itemSet *current, *train;
long necSupport;
int i, j;
int *svector;
int itemid;
svector = (int *)new int[pagenum]; ///////小项item 个数
for(i = 0; i < pagenum; i++)
svector[i] = 0;
for(i = 0; i < m_samples->size(); i++)
{
train = (itemSet *)m_samples->get(i); //取出list的第i个itemSet
for(j = 0; j < train->size(); j++)
{
itemid = (int)train->get(j); //难道 itemid也从0开始,note here ************************************************
svector[itemid] += 1; //得到所有小项的计数
}
}
necSupport = (long)(m_minSupport*(double)(m_samples->size())+0.5); //这里重复赋值了吧
for(i = 0; i < pagenum; i++)
{
if((double)svector[i]/(double)(m_samples->size()) >= m_minSupport) //找到1频繁项集
{
current = new itemSet();
current->add(i);
current->support(svector[i]);
setOfItemSets->add(current);
// delete current;
}
}
/* delete all the items that are not in 1-lerge itemset */
//如果某个itemSet里有小项出现次数小于necSupport则从itemSet中移除,相当于剪枝
i = 0;
while (i < m_samples->size())
{
train = (itemSet *)m_samples->get(i);
j = 0;
while (j < train->size())
{
itemid = train->get(j);
if(svector[itemid] < necSupport)
train->remove(j);
else
j++;
}
if(train->size() == 0) //itemSet的所有 item都小于最小支持度就把它移除
m_samples->remove(i);
else
i++;
}
delete svector;
return setOfItemSets; //返回所有的1频繁项集
}
list * CApriori::selfjoin(list *in, int size)
{
list *newVector = new list();
itemSet *first;
itemSet *second;
itemSet *result;
int i, j;
for (i = 0; i < in->size(); i++) //利用双重循环进行合并
{
first = (itemSet *)in->get(i);
for (j = i+1; j < in->size(); j++)
{
second = (itemSet *)in->get(j);
if((result = join(first, second, size)) != NULL)
{
if(prune(in, result, size)) //
{
result->support(0);
newVector->add(result); //add
}
else
delete result;
}
// delete second;
}
// delete first;
}
return(newVector);
}
bool CApriori::prune(list *in, itemSet *attend, int size)
{
itemSet *temp;
bool result = true;
for(int i = 0; i < attend->size(); i++)
{
temp = (itemSet *)attend->clone();
temp->remove(i);
if(in->indexOf(temp) < 0)
{
delete temp;
result = true;
break;
}
delete temp;
}
return(result);
}
itemSet * CApriori::join(itemSet *first, itemSet *attend, int size)
{
itemSet *result = (itemSet *)NULL;
int i;
if( ! check(first, attend, size)) //判断first and attend的前size个小项是否一致
return(result);
result = (itemSet *)new itemSet();
for(i = 0; i < size; i++)
result->add(first->get(i));
if(first->get(i) < attend->get(i)) //每个itemSet还有一个小项要加入到result中,这里是按递增的顺序加入
{
result->add(first->get(i));
result->add(attend->get(i));
}
else
{
result->add(attend->get(i));
result->add(first->get(i));
}
return(result);
}
bool CApriori::check(itemSet *first, itemSet *attend, int size)
{
for(int i = 0; i < size; i++)
if(first->get(i) != attend->get(i))
return(false);
return(true);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -