📄 assoc.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
Classes for association rules from sparse data were written by Matjaz Jursic.
*/
#include <iomanip>
#include <fstream>
#include "random.hpp"
#include "examplegen.hpp"
#include "spec_gen.hpp"
#include "table.hpp"
#include "assoc.ppp"
DEFINE_TOrangeVector_classDescription(PAssociationRule, "TAssociationRules", true, ORANGE_API)
class TItemSetNode;
/* These objects are collected in TExampleSets, lists of examples that correspond to a particular tree node.
'example' is a unique example id (basically its index in the original dataset)
'weight' is the example's weight. */
class TExWei {
public:
int example;
float weight;
TExWei(const int &ex, const float &wei)
: example(ex),
weight(wei)
{}
};
/* This is a set of examples, used to list the examples that support a particular tree node */
typedef vector<TExWei> TExampleSet;
/* A tree element that corresponds to an attribute value (ie, TItemSetNode has as many
TlItemSetValues as there are values that appear in itemsets.
For each value, we have the 'examples' that support it, the sum of their weights
('support') and the branch that contains more specialized itemsets. */
class TItemSetValue {
public:
int value;
TItemSetNode *branch;
float support;
TExampleSet examples;
// This constructor is called when building the 1-tree
TItemSetValue(int al)
: value(al),
branch(NULL),
support(0.0)
{}
// This constructor is called when itemsets are intersected (makePairs ets)
TItemSetValue(int al, const TExampleSet &ex, float asupp)
: value(al),
branch(NULL),
support(asupp),
examples(ex)
{}
~TItemSetValue();
void sumSupport()
{
support = 0;
ITERATE(TExampleSet, wi, examples)
support += (*wi).weight;
}
};
/* TItemSetNode splits itemsets according to the value of attribute 'attrIndex';
each element of 'values' corresponds to an attribute value (not necessarily to all,
but only to those values that appear in itemsets).
Itemsets for which the value is not defined are stored in a subtree in 'nextAttribute'.
This can be seen in TItemSetTree::findSupport that finds a node that corresponds to the
given itemset */
class TItemSetNode {
public:
int attrIndex;
TItemSetNode *nextAttribute;
vector<TItemSetValue> values;
// This constructor is called by 1-tree builder which initializes all values (and later reduces them)
TItemSetNode(PVariable var, int anattri)
: attrIndex(anattri),
nextAttribute(NULL)
{
for(int vi = 0, ve = var->noOfValues(); vi<ve; vi++)
values.push_back(TItemSetValue(vi));
}
// This constructor is called when extending the tree
TItemSetNode(int anattri)
: attrIndex(anattri),
nextAttribute(NULL)
{}
~TItemSetNode()
{ mldelete nextAttribute; }
};
class TRuleTreeNode {
public:
int attrIndex;
int value;
float support;
TExampleSet examples;
TRuleTreeNode *nextAttribute;
TRuleTreeNode *hasValue;
TRuleTreeNode(const int ai, const int val, const float supp, const TExampleSet &ex)
: attrIndex(ai),
value(val),
support(supp),
examples(ex),
nextAttribute(NULL),
hasValue(NULL)
{}
~TRuleTreeNode()
{ mldelete nextAttribute;
mldelete hasValue;
}
};
TItemSetValue::~TItemSetValue()
{ mldelete branch;
}
TAssociationRule::TAssociationRule(PExample al, PExample ar)
: left(al),
right(ar),
support(0.0),
confidence(0.0),
coverage(0.0),
strength(0.0),
lift(0.0),
leverage(0.0),
nAppliesLeft(0),
nAppliesRight(0),
nAppliesBoth(0),
nExamples(0),
nLeft(countItems(al)),
nRight(countItems(ar))
{}
TAssociationRule::TAssociationRule(PExample al, PExample ar,
const float &napLeft, const float &napRight, const float &napBoth, const float &nExamples,
int anleft, int anright)
: left(al),
right(ar),
support(napBoth/nExamples),
confidence(napBoth/napLeft),
coverage(napLeft/nExamples),
strength(napRight/napLeft),
lift(nExamples * napBoth /napLeft / napRight),
leverage((napBoth*nExamples - napLeft*napRight)/nExamples/nExamples),
nAppliesLeft(napLeft),
nAppliesRight(napRight),
nAppliesBoth(napBoth),
nExamples(nExamples),
nLeft(anleft < 0 ? countItems(al) : anleft),
nRight(anright < 0 ? countItems(ar) : anright)
{
TExample::iterator ei, ee;
for(ei = left->begin(), ee = left->end(); ei!=ee; ei++)
if ((*ei).isSpecial())
(*ei).setDC();
for(ei = right->begin(), ee = right->end(); ei!=ee; ei++)
if ((*ei).isSpecial())
(*ei).setDK();
}
int TAssociationRule::countItems(PExample ex)
{
int res = 0;
PITERATE(TExample, ei, ex)
if (!(*ei).isSpecial())
res++;
return res;
}
bool TAssociationRule::applies(const TExample &ex, const PExample &side)
{
if (side->domain->variables->size())
return side->compatible(ex);
// all meta-attributes that appear in 'side' must also appear in 'ex' and be noSpecial
const_ITERATE(TMetaValues, mi, side->meta) {
if (!ex.hasMeta((*mi).first) || ex.getMeta((*mi).first).isSpecial())
return false;
}
return true;
}
float computeIntersection(const TExampleSet &set1, const TExampleSet &set2, TExampleSet &intersection)
{
float isupp = 0.0;
TExampleSet::const_iterator se1i(set1.begin()), se1e(set1.end());
TExampleSet::const_iterator se2i(set2.begin()), se2e(set2.end());
while((se1i != se1e) && (se2i != se2e)) {
if ((*se1i).example < (*se2i).example)
se1i++;
else if ((*se1i).example > (*se2i).example)
se2i++;
else {
intersection.push_back(*se1i);
isupp += (*se1i).weight;
se1i++;
se2i++;
}
}
return isupp;
}
/* Searches the tree to find a node that corresponds to itemset 'ex' and returns its support */
float findSupport(const TExample &ex, TItemSetNode *node)
{
vector<TItemSetValue>::iterator li = node->values.begin(); // This is initialized just to avoid warnings.
TExample::const_iterator ei(ex.begin()), eei(ex.end());
int attrIndex = 0;
for(; ei!=eei; ei++, attrIndex++)
// If attribute is in the itemset
if (!(*ei).isSpecial()) {
// Search for the attribute in the list linked by 'nextAttribute'
while (node && (node->attrIndex != attrIndex))
node = node->nextAttribute;
// this attribute does not appear in any itemset that begins with the attributes
// that we have already encountered in the example
if (!node)
return 0.0;
// Search for the value
for(li = node->values.begin(); (li!=node->values.end()) && ((*li).value != (*ei).intV); li++);
// this attribute value does not appear in any itemset ...
if (li==node->values.end())
return 0.0;
// continue if possible
if (!(*li).branch)
break;
node = (*li).branch;
}
// If we are not at the end of example yet, we must make sure no further values appear in the itemset
if (ei!=ex.end())
while((++ei!=ex.end()) && (*ei).isSpecial());
return (ei==ex.end()) ? (*li).support : 0;
}
float findSupport(const TExample &ex, TRuleTreeNode *node)
{
TExample::const_iterator ei(ex.begin()), eei(ex.end());
for(; (ei!=eei) && !(*ei).isSpecial(); ei++);
for(; ei!=eei; ei++, node = node->hasValue) {
while (node && (node->attrIndex != ei-ex.begin()))
node = node->nextAttribute;
if (!node || (node->value != (*ei).intV))
raiseError("internal error in RuleTree (attribute/value not found)");
while((++ei!=eei) && !(*ei).isSpecial());
if (ei==eei)
return node->support;
}
raiseError("internal error in RuleTree (attribute/value not found)");
return 0.0; //, to make compiler happy
}
TAssociationRulesInducer::TAssociationRulesInducer(float asupp, float aconf)
: maxItemSets(15000),
confidence(aconf),
support(asupp),
classificationRules(false)
{}
PAssociationRules TAssociationRulesInducer::operator()(PExampleGenerator examples, const int &weightID)
{
if (classificationRules && !examples->domain->classVar)
raiseError("cannot induce classification rules on classless data");
PITERATE(TVarList, vi, examples->domain->variables)
if ((*vi)->varType != TValue::INTVAR)
raiseError("cannot induce rules with non-discrete attributes (such as '%s')", (*vi)->name.c_str());
TItemSetNode *tree = NULL;
PAssociationRules rules;
try {
int depth, nOfExamples;
TDiscDistribution classDist;
buildTrees(examples, weightID, tree, depth, nOfExamples, classDist);
rules = classificationRules ? generateClassificationRules(examples->domain, tree, nOfExamples, classDist)
: generateRules(examples->domain, tree, depth, nOfExamples);
}
catch (...) {
mldelete tree;
throw;
}
mldelete tree;
return rules;
}
void TAssociationRulesInducer::buildTrees(PExampleGenerator gen, const int &weightID, TItemSetNode *&tree, int &depth, int &nOfExamples, TDiscDistribution &classDist)
{
float suppN;
depth = 1;
for(int totni = 0, ni = buildTree1(gen, weightID, tree, suppN, nOfExamples, classDist);
ni;
ni = buildNext1(tree, ++depth, suppN)) {
totni += ni;
if (totni>maxItemSets)
raiseError("too many itemsets (%i); increase 'maxItemSets'", totni);
}
--depth;
}
// buildTree1: builds the first tree with 1-itemsets
int TAssociationRulesInducer::buildTree1(PExampleGenerator gen, const int &weightID, TItemSetNode *&tree, float &suppN, int &nOfExamples, TDiscDistribution &classDist)
{
tree = NULL;
if (classificationRules)
classDist = TDiscDistribution(gen->domain->classVar);
int index, itemSets = 0;
TItemSetNode **toChange = &tree;
// builds an empty tree with all possible 1-itemsets
TVarList::const_iterator vi(gen->domain->variables->begin()), ve(gen->domain->variables->end());
for(index = 0; vi!=ve; vi++, index++) {
*toChange = mlnew TItemSetNode(*vi, index);
toChange = &((*toChange)->nextAttribute);
}
// fills the tree with indices of examples from gen
index = 0;
nOfExamples = 0;
TExampleIterator ei(gen->begin());
for(; ei; ++ei, index++) {
const float wex = WEIGHT(*ei);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -