📄 weka.pm
字号:
package AI::Categorizer::Learner::Weka;

use strict;
use AI::Categorizer::Learner::Boolean;
use base qw(AI::Categorizer::Learner::Boolean);
use Params::Validate qw(:types);
use File::Spec;
use File::Copy;
use File::Path ();
use File::Temp ();

__PACKAGE__->valid_params
  (
   java_path       => {type => SCALAR,          default  => 'java'},
   java_args       => {type => SCALAR|ARRAYREF, optional => 1},
   weka_path       => {type => SCALAR,          optional => 1},
   weka_classifier => {type => SCALAR,          default  => 'weka.classifiers.NaiveBayes'},
   weka_args       => {type => SCALAR|ARRAYREF, optional => 1},
   tmpdir          => {type => SCALAR,          default  => File::Spec->tmpdir},
  );

__PACKAGE__->contained_objects
  (
   features => {class => 'AI::Categorizer::FeatureVector', delayed => 1},
  );

# Constructor.  Normalizes the scalar-or-arrayref parameters to arrayrefs
# and folds the optional weka_path into java's -classpath arguments.
sub new {
  my $class = shift;
  my $self  = $class->SUPER::new(@_);

  foreach my $param ('java_args', 'weka_args') {
    $self->{$param} = [] unless defined $self->{$param};
    $self->{$param} = [ $self->{$param} ]
      unless UNIVERSAL::isa($self->{$param}, 'ARRAY');
  }

  if (defined $self->{weka_path}) {
    push @{ $self->{java_args} }, '-classpath', $self->{weka_path};
    delete $self->{weka_path};
  }

  return $self;
}

# Invocation shape for training:
# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -t /tmp/train_file.arff -d /tmp/weka-machine

# Record the global feature list, make a working directory for the
# per-category machine files, and build the dummy test file that Weka
# insists on even during training.  Delegates the per-category work to
# the Boolean superclass.
sub create_model {
  my $self  = shift;
  my $model = $self->{model} ||= {};

  $model->{all_features} = [ $self->knowledge_set->features->names ];
  $model->{_in_dir}      = File::Temp::tempdir( DIR => $self->{tmpdir} );

  # Create a dummy test file in ARFF format (a kludgey WEKA requirement)
  my $dummy_features = $self->create_delayed_object('features');
  $model->{dummy_file} = $self->create_arff_file("dummy", [[$dummy_features, 0]]);

  $self->SUPER::create_model(@_);
}

# Train one boolean (in/out of category) classifier by shelling out to
# java.  Returns a hashref holding the name of the machine file Weka
# wrote, which the superclass stores as this category's learner.
sub create_boolean_model {
  my ($self, $positives, $negatives, $cat) = @_;

  my @training = ( map( [$_->features, 1], @$positives ),
                   map( [$_->features, 0], @$negatives ) );
  my $train_file = $self->create_arff_file($cat->name . '_train', \@training);

  my %info    = (machine_file => $cat->name . '_model');
  my $outfile = File::Spec->catfile($self->{model}{_in_dir}, $info{machine_file});

  my @cmd = ($self->{java_path},
             @{ $self->{java_args} },
             $self->{weka_classifier},
             @{ $self->{weka_args} },
             '-t', $train_file,
             '-T', $self->{model}{dummy_file},
             '-d', $outfile,
             '-v',
             '-p', '0',
            );
  $self->do_cmd(@cmd);
  unlink $train_file or warn "Couldn't remove $train_file: $!";

  return \%info;
}

# Invocation shape for categorization:
# java -classpath /Applications/Science/weka-3-2-3/weka.jar weka.classifiers.NaiveBayes -l out -T test.arff -p 0

# Score a single document against one category's saved machine by
# running Weka in prediction mode and parsing its stdout.
sub get_boolean_score {
  my ($self, $doc, $info) = @_;

  # One-row ARFF file containing just this document.
  my $doc_file     = $self->create_arff_file('doc', [[$doc->features, 0]], $self->{tmpdir});
  my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, $info->{machine_file});

  my @cmd = ($self->{java_path},
             @{ $self->{java_args} },
             $self->{weka_classifier},
             '-l', $machine_file,
             '-T', $doc_file,
             '-p', 0,
            );

  my %scores;
  foreach my $line ($self->do_cmd(@cmd)) {
    # Output format:  <doc> <category> <score> <real_category>
    # e.g.            0 large.elem 0.4515551620220952 numberth.high
    next unless my ($index, $predicted, $score) =
      $line =~ /^([\d.]+)\s+(\S+)\s+([\d.]+)/;
    $scores{$predicted} = $score;
  }

  return $scores{1} || 0;  # Not sure what weka's scores represent...
}

# Categorize every document in a collection.  Batches all documents into
# one ARFF file and runs Weka once per category, which is much cheaper
# than one java process per (document, category) pair.
sub categorize_collection {
  my ($self, %args) = @_;
  my $collection = $args{collection} or die "No collection provided";

  my @alldocs;
  while (my $doc = $collection->next) {
    push @alldocs, $doc;
  }
  my $doc_file = $self->create_arff_file("docs", [ map [$_->features, 0], @alldocs ]);

  my @assigned;
  my $learners = $self->{model}{learners};
  foreach my $cat (keys %$learners) {
    my $machine_file = File::Spec->catfile($self->{model}{_in_dir}, "${cat}_model");
    my @cmd = ($self->{java_path},
               @{ $self->{java_args} },
               $self->{weka_classifier},
               '-l', $machine_file,
               '-T', $doc_file,
               '-p', 0,
              );

    foreach my $line ($self->do_cmd(@cmd)) {
      next unless $line =~ /\S/;

      # 0 large.elem 0.4515551620220952 numberth.high
      unless ($line =~ /^([\d.]+)\s+(\S+)\s+([\d.]+)\s+(\S+)/) {
        warn "Can't parse line $line";
        next;
      }
      my ($index, $predicted, $score) = ($1, $2, $3);

      # $predicted is '1' (in category) or '0' (out), so a plain truth
      # test decides whether to record the score.
      $assigned[$index]{$cat} = $score if $predicted;  # Not sure what weka's scores represent
      print STDERR "$index: assigned=($predicted) correct=(",
        $alldocs[$index]->is_in_category($cat) ? 1 : 0, ")\n"
          if $self->verbose;
    }
  }

  my $experiment = $self->create_delayed_object
    ('experiment', categories => [ map $_->name, $self->categories ]);
  foreach my $i (0 .. $#alldocs) {
    $experiment->add_result([ keys %{ $assigned[$i] } ],
                            [ map $_->name, $alldocs[$i]->categories ],
                            $alldocs[$i]->name);
  }

  return $experiment;
}

# Run an external command, returning its stdout as a list of lines.
# Uses the fork-then-exec form of open so no shell is involved.
sub do_cmd {
  my ($self, @cmd) = @_;
  print STDERR " % @cmd\n" if $self->verbose;

  my @output;
  local *KID_TO_READ;
  my $pid = open(KID_TO_READ, "-|");

  if ($pid) {    # parent
    @output = <KID_TO_READ>;
    close(KID_TO_READ) or warn "@cmd exited $?";
  } else {       # child
    exec(@cmd) or die "Can't exec @cmd: $!";
  }

  return @output;
}

# Write a list of [FeatureVector, category] pairs to a temporary ARFF
# file and return its path.  $dir defaults to the model's working
# directory.  Features not present in the training vocabulary are
# silently dropped.
sub create_arff_file {
  my ($self, $name, $docs, $dir) = @_;
  $dir = $self->{model}{_in_dir} unless defined $dir;

  my ($fh, $filename) = File::Temp::tempfile($name . "_XXXX",  # Template
                                             DIR    => $dir,
                                             SUFFIX => '.arff',
                                            );

  print $fh "\@RELATION foo\n\n";

  my $feature_names = $self->{model}{all_features};
  foreach my $feature (@$feature_names) {
    print $fh "\@ATTRIBUTE feature-$feature REAL\n";
  }
  print $fh "\@ATTRIBUTE category {1, 0}\n\n";

  my %feature_indices = map { $feature_names->[$_], $_ } 0 .. $#{$feature_names};
  my $last_index      = keys %feature_indices;  # index of the category attribute

  # We use the 'sparse' format, see http://www.cs.waikato.ac.nz/~ml/weka/arff.html
  print $fh "\@DATA\n";
  foreach my $doc (@$docs) {
    my ($features, $cat) = @$doc;
    my $f = $features->as_hash;
    my @ordered_keys = sort { $feature_indices{$a} <=> $feature_indices{$b} }
                       grep { exists $feature_indices{$_} }
                       keys %$f;
    print $fh ("{",
               join(', ',
                    map("$feature_indices{$_} $f->{$_}", @ordered_keys),
                    "$last_index '$cat'"),
               "}\n");
  }

  return $filename;
}

# Persist the learner.  The knowledge set is excluded from
# serialization; the Weka machine files are copied into a 'models'
# subdirectory of $path so they survive the temp directory.
sub save_state {
  my ($self, $path) = @_;
  {
    local $self->{knowledge_set};
    $self->SUPER::save_state($path);
  }
  return unless $self->{model};

  my $model_dir = File::Spec->catdir($path, 'models');
  mkdir($model_dir, 0777) or die "Couldn't create $model_dir: $!";

  while (my ($name, $learner) = each %{ $self->{model}{learners} }) {
    my $oldpath = File::Spec->catdir($self->{model}{_in_dir}, $learner->{machine_file});
    my $newpath = File::Spec->catfile($model_dir, "${name}_model");
    File::Copy::copy($oldpath, $newpath);
  }
  $self->{model}{_in_dir} = $model_dir;
}

# Reload a learner saved by save_state(), pointing the model at the
# saved 'models' directory when one exists.
sub restore_state {
  my ($pkg, $path) = @_;
  my $self = $pkg->SUPER::restore_state($path);

  my $model_dir = File::Spec->catdir($path, 'models');
  return $self unless -e $model_dir;
  $self->{model}{_in_dir} = $model_dir;

  return $self;
}

1;

__END__

=head1 NAME

AI::Categorizer::Learner::Weka - Pass-through wrapper to Weka system

=head1 SYNOPSIS

  use AI::Categorizer::Learner::Weka;

  # Here $k is an AI::Categorizer::KnowledgeSet object
  my $nb = new AI::Categorizer::Learner::Weka(...parameters...);
  $nb->train(knowledge_set => $k);
  $nb->save_state('filename');

  ... time passes ...

  $nb = AI::Categorizer::Learner->restore_state('filename');
  my $c = new AI::Categorizer::Collection::Files( path => ... );
  while (my $document = $c->next) {
    my $hypothesis = $nb->categorize($document);
    print "Best assigned category: ", $hypothesis->best_category, "\n";
  }

=head1 DESCRIPTION

This class doesn't implement any machine learners of its own, it
merely passes the data through to the Weka machine learning system
(http://www.cs.waikato.ac.nz/~ml/weka/).  This can give you access to
a collection of machine learning algorithms not otherwise implemented
in C<AI::Categorizer>.

Currently this is a simple command-line wrapper that calls C<java>
subprocesses.  In the future this may be converted to an
C<Inline::Java> wrapper for better performance (faster running
times).  However, if you're looking for really great performance,
you're probably looking in the wrong place - this Weka wrapper is
intended more as a way to try lots of different machine learning
methods.

=head1 METHODS

This class inherits from the C<AI::Categorizer::Learner> class, so all
of its methods are available unless explicitly mentioned here.

=head2 new()

Creates a new Weka Learner and returns it.  In addition to the
parameters accepted by the C<AI::Categorizer::Learner> class, the
Weka subclass accepts the following parameters:

=over 4

=item java_path

Specifies where the C<java> executable can be found on this system.
The default is simply C<java>, meaning that it will search your
C<PATH> to find java.

=item java_args

Specifies a list of any additional arguments to give to the java
process.  Commonly it's necessary to allocate more memory than the
default, using an argument like C<-Xmx130MB>.

=item weka_path

Specifies the path to the C<weka.jar> file containing the Weka
bytecode.  If Weka has been installed somewhere in your java
C<CLASSPATH>, you needn't specify a C<weka_path>.

=item weka_classifier

Specifies the Weka class to use for a categorizer.  The default is
C<weka.classifiers.NaiveBayes>.  Consult your Weka documentation for a
list of other classifiers available.

=item weka_args

Specifies a list of any additional arguments to pass to the Weka
classifier class when building the categorizer.

=item tmpdir

A directory in which temporary files will be written when training the
categorizer and categorizing new documents.  The default is given by
C<< File::Spec->tmpdir >>.

=back

=head2 train(knowledge_set => $k)

Trains the categorizer.  This prepares it for later use in
categorizing documents.  The C<knowledge_set> parameter must provide
an object of the class C<AI::Categorizer::KnowledgeSet> (or a subclass
thereof), populated with lots of documents and categories.  See
L<AI::Categorizer::KnowledgeSet> for the details of how to create such
an object.

=head2 categorize($document)

Returns an C<AI::Categorizer::Hypothesis> object representing the
categorizer's "best guess" about which categories the given document
should be assigned to.  See L<AI::Categorizer::Hypothesis> for more
details on how to use this object.

=head2 save_state($path)

Saves the categorizer for later use.  This method is inherited from
C<AI::Categorizer::Storable>.

=head1 AUTHOR

Ken Williams, ken@mathforum.org

=head1 COPYRIGHT

Copyright 2000-2003 Ken Williams.  All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

AI::Categorizer(3)

=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -