📄 associationrules.pm
字号:
package Data::Mining::AssociationRules;
use strict;
use warnings;
BEGIN {
use Exporter ();
use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
$AUTHOR = 'Dan Frankowski <dfrankow@winternet.com>';
@EXPORT = @EXPORT_OK = qw(generate_frequent_sets
generate_rules
read_frequent_sets
read_transaction_file
set_debug);
%EXPORT_TAGS = ();
@ISA = qw(Exporter);
$VERSION = 0.1;
}
my $debug = 0;
=head1 NAME
Data::Mining:AssociationRules - Mine association rules and frequent
sets from data.
=head1 SYNOPSIS
use Data::Mining::AssociationRules;
my %transaction_map;
my $transaction_file = "foo.txt";
read_transaction_file(\%transaction_map, $transaction_file);
generate_frequent_sets(\%transaction_map, $output_file_prefix,
$support_threshold, $max_n);
generate_rules($output_file_prefix, $support_threshold,
$confidence_threshold, $max_n);
read_frequent_sets($set_map_ref, $file_prefix)
set_debug(1);
perl arm.pl -transaction-file foo.txt -support 2 -confidence-threshold 0.01 -max-set-size 6
See also FUNCTIONS, DESCRIPTION, and EXAMPLES below.
=head1 INSTALLATION
The typical:
=over
=item 0 perl Makefile.PL
=item 0 make test
=item 0 make install
=back
=head1 FUNCTIONS
=cut
=pod
=head2 read_transaction_file($transaction_map_ref, $transaction_file)
Read in a transaction map from a file which has lines of two
whitespace-separated columns:
=over
transaction-id item-id
=back
=cut
sub read_transaction_file {
my $transaction_map_ref = shift;
my $transaction_file = shift;
open(BFILE, $transaction_file) or die "Couldn't open $transaction_file: $!\n";
while ( <BFILE> ) {
my @data = split;
die "Expected 2 columns, found ", int(@data), "\n" if int(@data) != 2;
my ($tid, $item) = @data;
$$transaction_map_ref{$item}{$tid}++;
}
close(BFILE);
}
=pod
=head2 generate_frequent_sets ($transaction_map_ref, $file_prefix, $support_threshold, $max_n)
Given
=over
=item 0 a map of transactions
=item 0 a file prefix
=item 0 a support threshold
=item 0 a maximum frequent set size to look for (optional)
=back
generate the frequent sets in some files, one file per size of the set.
That is, all 1-sets are in a file, all 2-sets in another, etc.
The files are lines of the form:
=over
support-count item-set
=back
where
=over
=item 0 support-count is the number of transactions in which the item-set appears
=item 0 item-set is one or more space-separated items
=back
=cut
sub generate_frequent_sets {
my $transaction_map_ref = shift;
my $file_prefix = shift;
my $support_threshold = shift;
my $max_n = shift;
# Generate 1-sets
my $n = 1;
my $out_nset = nset_filename($n, $file_prefix, $support_threshold);
open(OUT, ">$out_nset") or die "Couldn't open $out_nset for writing: $!\n";
while (my ($item, $item_map) = each %{$transaction_map_ref}) {
my $support = int(keys(%$item_map));
if ($support >= $support_threshold) {
print OUT "$support $item\n";
}
}
my $num_nsets = int(keys(%{$transaction_map_ref}));
print STDERR "$num_nsets $n-sets\n" if $debug;
close(OUT);
# Generate n-sets
my $done = 0;
while ($num_nsets > 0) {
$n++;
$num_nsets = 0;
last if defined($max_n) && ($n > $max_n);
# Go through (n-1)-sets, pruning as you go
my $prior_nset = nset_filename($n-1, $file_prefix, $support_threshold);
open(PRIOR, $prior_nset) or die "Couldn't open $prior_nset: $!\n";
$out_nset = nset_filename($n, $file_prefix, $support_threshold);
open(OUT, ">$out_nset") or die "Couldn't open $out_nset: $!\n";
while ( <PRIOR> ) {
my ($count, @set) = split;
# Create userset, which contains the intersection of $transaction{@set}
my %userset = % {$$transaction_map_ref{$set[0]}};
foreach my $item ( @set[1 .. $#set] ) {
while (my ($user, $dummy) = each %userset) {
if (!exists($$transaction_map_ref{$item}{$user})) {
delete($userset{$user});
}
}
}
# For each 1-set, intersect further, and spit out if > support_threshold
while (my ($item, $user_set) = each %{$transaction_map_ref}) {
# Only spit sets of non-decreasing elements
# This keeps out duplicates
my $dup_set = 0;
foreach my $set_item ( @set ) {
if ($set_item ge $item) {
$dup_set = 1;
last;
}
}
if (!$dup_set) {
my %newset = %userset;
while (my ($user, $dummy) = each %newset) {
if (!exists($$user_set{$user})) {
delete($newset{$user});
}
}
#print "newset is now " . map_str(\%newset) . "\n";
my $num_users = int(keys(%newset));
#print "item $item set @set numusers is $num_users\n";
if ($num_users >= $support_threshold) {
print OUT "$num_users @set $item\n";
$num_nsets++;
}
}
}
}
close(PRIOR);
close(OUT);
print STDERR "$num_nsets $n-sets\n" if ($num_nsets > 0) && $debug;
unlink($out_nset) if 0 == $num_nsets;
}
}
=pod
=head2 read_frequent_sets($set_map_ref, $file_prefix)
Given
=over
=item 0 a set map
=item 0 a file prefix
=item 0 support threshold
=item 0 max frequent set size (optional)
=back
read all the frequent sets into a single map, which has as its key the
frequent set (joined by single spaces) and as its value the support.
=cut
sub read_frequent_sets {
my $set_map_ref = shift;
my $file_prefix = shift;
my $support_threshold = shift;
my $max_n = shift;
opendir(DIR, '.') || die "can't opendir '.': $!";
my @files = grep { /^$file_prefix/ && -f "./$_" } readdir(DIR);
closedir DIR;
foreach my $file (@files) {
# print STDERR "Read file $file ..\n";
if ( $file =~ /${file_prefix}\-support\-(\d+)\-(\d+)set/ ) {
my $support = $1;
my $n = $2;
next if ($support != $support_threshold)
|| (defined($max_n) && ($n > $max_n));
open(SETS, $file) or die "Couldn't open $file: $!\n";
while ( <SETS> ) {
my ($count, @set) = split;
$$set_map_ref{join(' ', @set)} = $count;
}
close(SETS);
}
}
}
# =pod
# =head2 nset_filename($n, $file_prefix, $support_threshold)
# Given
# =over
# =item 0 set size
# =item 0 a file prefix
# =item 0 a support threshold
# =back
# return the name of the file that contains the specified frequent sets.
# =cut
sub nset_filename {
my $n = shift;
my $file_prefix = shift;
my $support_threshold = shift;
return $file_prefix . "-support-" . $support_threshold . "-" . $n . "set.txt";
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -