⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 associationrules.pm

📁 perl implementation of the apriori algorithm
💻 PM
📖 第 1 页 / 共 2 页
字号:
package Data::Mining::AssociationRules;

use strict;
use warnings;

BEGIN {
        use Exporter ();
        use vars qw ($AUTHOR $VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
        $AUTHOR      = 'Dan Frankowski <dfrankow@winternet.com>';
        @EXPORT      = @EXPORT_OK = qw(generate_frequent_sets
                                       generate_rules
                                       read_frequent_sets
                                       read_transaction_file
                                       set_debug);
                                       
        %EXPORT_TAGS = ();
        @ISA         = qw(Exporter);
        $VERSION     = 0.1;
}

my $debug = 0;

=head1 NAME

Data::Mining:AssociationRules - Mine association rules and frequent
sets from data.

=head1 SYNOPSIS

 use Data::Mining::AssociationRules;

 my %transaction_map;
 my $transaction_file = "foo.txt";

 read_transaction_file(\%transaction_map, $transaction_file);

 generate_frequent_sets(\%transaction_map, $output_file_prefix,
                        $support_threshold, $max_n);

 generate_rules($output_file_prefix, $support_threshold,
                $confidence_threshold, $max_n);

 read_frequent_sets($set_map_ref, $file_prefix)

 set_debug(1);

 perl arm.pl -transaction-file foo.txt -support 2 -confidence-threshold 0.01 -max-set-size 6

 See also FUNCTIONS, DESCRIPTION, and EXAMPLES below.

=head1 INSTALLATION

The typical:

=over

=item 0 perl Makefile.PL

=item 0 make test

=item 0 make install

=back

=head1 FUNCTIONS

=cut

=pod

=head2 read_transaction_file($transaction_map_ref, $transaction_file)

Read in a transaction map from a file which has lines of two
whitespace-separated columns:

=over

 transaction-id item-id

=back

=cut

sub read_transaction_file {
  my $transaction_map_ref = shift;
  my $transaction_file = shift;

  open(BFILE, $transaction_file) or die "Couldn't open $transaction_file: $!\n";
  while ( <BFILE> ) {
    my @data = split;
    die "Expected 2 columns, found ", int(@data), "\n" if int(@data) != 2;
    my ($tid, $item) = @data;
    $$transaction_map_ref{$item}{$tid}++;
  }
  close(BFILE);
}

=pod

=head2 generate_frequent_sets ($transaction_map_ref, $file_prefix, $support_threshold, $max_n)

Given

=over

=item 0 a map of transactions

=item 0 a file prefix

=item 0 a support threshold

=item 0 a maximum frequent set size to look for (optional)

=back

generate the frequent sets in some files, one file per size of the set.
That is, all 1-sets are in a file, all 2-sets in another, etc.

The files are lines of the form:

=over

 support-count item-set

=back

where

=over

=item 0 support-count is the number of transactions in which the item-set appears

=item 0 item-set is one or more space-separated items

=back

=cut

sub generate_frequent_sets {
  my $transaction_map_ref = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;
  my $max_n = shift;

  # Generate 1-sets
  my $n = 1;
  my $out_nset = nset_filename($n, $file_prefix, $support_threshold);
  open(OUT, ">$out_nset") or die "Couldn't open $out_nset for writing: $!\n";
  while (my ($item, $item_map) = each %{$transaction_map_ref}) {
    my $support = int(keys(%$item_map));
    if ($support >= $support_threshold) {
      print OUT "$support $item\n";
    }
  }
  my $num_nsets = int(keys(%{$transaction_map_ref}));
  print STDERR "$num_nsets $n-sets\n" if $debug;
  close(OUT);

  # Generate n-sets
  my $done = 0;
  while ($num_nsets > 0) {
    $n++;
    $num_nsets = 0;

    last if defined($max_n) && ($n > $max_n);

    # Go through (n-1)-sets, pruning as you go
    my $prior_nset = nset_filename($n-1, $file_prefix, $support_threshold);
    open(PRIOR, $prior_nset) or die "Couldn't open $prior_nset: $!\n";
    $out_nset = nset_filename($n, $file_prefix, $support_threshold);
    open(OUT, ">$out_nset") or die "Couldn't open $out_nset: $!\n";
    while ( <PRIOR> ) {
      my ($count, @set) = split;
      
      # Create userset, which contains the intersection of $transaction{@set}
      my %userset = % {$$transaction_map_ref{$set[0]}};
      foreach my $item ( @set[1 .. $#set] ) {
        while (my ($user, $dummy) = each %userset) {
          if (!exists($$transaction_map_ref{$item}{$user})) {
            delete($userset{$user});
          }
        }
      }
      
      # For each 1-set, intersect further, and spit out if > support_threshold
      while (my ($item, $user_set) = each %{$transaction_map_ref}) {
	# Only spit sets of non-decreasing elements
	# This keeps out duplicates
	my $dup_set = 0;
	foreach my $set_item ( @set ) {
          if ($set_item ge $item) {
            $dup_set = 1;
            last;
          }
	}
        
	if (!$dup_set) {
          my %newset = %userset;
          while (my ($user, $dummy) = each %newset) {
            if (!exists($$user_set{$user})) {
              delete($newset{$user});
            }
          }
          #print "newset is now " . map_str(\%newset) . "\n";
          my $num_users = int(keys(%newset));
          #print "item $item set @set numusers is $num_users\n";
          if ($num_users >= $support_threshold) {
            print OUT "$num_users @set $item\n";
            $num_nsets++;
          }
	}
      }
    }
    close(PRIOR);
    close(OUT);
    print STDERR "$num_nsets $n-sets\n" if ($num_nsets > 0) && $debug;
    unlink($out_nset) if 0 == $num_nsets;
  }
}

=pod

=head2 read_frequent_sets($set_map_ref, $file_prefix)

Given

=over

=item 0 a set map

=item 0 a file prefix

=item 0 support threshold

=item 0 max frequent set size (optional)

=back

read all the frequent sets into a single map, which has as its key the
frequent set (joined by single spaces) and as its value the support.

=cut

sub read_frequent_sets {
  my $set_map_ref = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;
  my $max_n = shift;

  opendir(DIR, '.') || die "can't opendir '.': $!";
  my @files = grep { /^$file_prefix/ && -f "./$_" } readdir(DIR);
  closedir DIR;

  foreach my $file (@files) {
    # print STDERR "Read file $file ..\n";
    if ( $file =~ /${file_prefix}\-support\-(\d+)\-(\d+)set/ ) {
      my $support = $1;
      my $n = $2;
      next if ($support != $support_threshold)
              || (defined($max_n) && ($n > $max_n));

      open(SETS, $file) or die "Couldn't open $file: $!\n";
      while ( <SETS> ) {
        my ($count, @set) = split;
        $$set_map_ref{join(' ', @set)} = $count;
      }
      close(SETS);
    }
  }
}

# =pod

# =head2 nset_filename($n, $file_prefix, $support_threshold)

# Given

# =over

# =item 0 set size

# =item 0 a file prefix

# =item 0 a support threshold

# =back

# return the name of the file that contains the specified frequent sets.

# =cut
sub nset_filename {
  my $n = shift;
  my $file_prefix = shift;
  my $support_threshold = shift;

  return $file_prefix . "-support-" . $support_threshold . "-" . $n . "set.txt";
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -