📄 knowledgeset.pm
字号:
package AI::Categorizer::KnowledgeSet;

# A KnowledgeSet holds a set of documents and a set of categories, builds
# feature vectors over them, and applies term/collection/normalization
# weighting to the documents' features.

use strict;
use warnings;
use Class::Container;
use AI::Categorizer::Storable;
use base qw(Class::Container AI::Categorizer::Storable);

use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::Document;
use AI::Categorizer::Category;
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);

# Constructor parameters accepted by new().
# NOTE(review): new() and load() also read $self->{load} and
# $self->{features_kept}, which are not declared here -- confirm whether
# they are declared elsewhere or were dropped from this list.
__PACKAGE__->valid_params
  (
   categories => {
                  type      => ARRAYREF,
                  default   => [],
                  callbacks => { 'all are Category objects' =>
                                 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'),
                                         @{$_[0]} },
                               },
                 },
   documents  => {
                  type      => ARRAYREF,
                  default   => [],
                  callbacks => { 'all are Document objects' =>
                                 sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'),
                                         @{$_[0]} },
                               },
                 },
   scan_first => {
                  type    => BOOLEAN,
                  default => 1,
                 },
   feature_selector => {
                        isa => 'AI::Categorizer::FeatureSelector',
                       },
   tfidf_weighting  => {
                        type     => SCALAR,
                        optional => 1,
                       },
   term_weighting  => {
                       type    => SCALAR,
                       default => 'x',
                      },
   collection_weighting => {
                            type    => SCALAR,
                            default => 'x',
                           },
   normalize_weighting => {
                           type    => SCALAR,
                           default => 'x',
                          },
   verbose => {
               type    => SCALAR,
               default => 0,
              },
  );

# Classes of the objects this container creates (most of them lazily).
__PACKAGE__->contained_objects
  (
   document         => { delayed => 1, class => 'AI::Categorizer::Document' },
   category         => { delayed => 1, class => 'AI::Categorizer::Category' },
   collection       => { delayed => 1, class => 'AI::Categorizer::Collection::Files' },
   features         => { delayed => 1, class => 'AI::Categorizer::FeatureVector' },
   feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
  );

# new(%args)
#
# Builds a KnowledgeSet.  A true 'tfidf_weighting' argument is a shortcut:
# its characters are split and assigned, in order, to 'term_weighting',
# 'collection_weighting' and 'normalize_weighting'.  The 'categories' and
# 'documents' array refs are converted into AI::Categorizer::ObjectSet
# objects.  If $self->{load} is set, load() is called with it (a plain
# scalar is treated as a path) and the entry is then removed.
sub new {
  my ($pkg, %args) = @_;

  # Shortcuts
  if ($args{tfidf_weighting}) {
    @args{'term_weighting', 'collection_weighting', 'normalize_weighting'}
      = split '', $args{tfidf_weighting};
    delete $args{tfidf_weighting};
  }

  my $self = $pkg->SUPER::new(%args);

  # Convert to AI::Categorizer::ObjectSet sets.  Direct method-call syntax
  # is used instead of the ambiguous indirect-object form "new Class(...)".
  $self->{categories} = AI::Categorizer::ObjectSet->new( @{$self->{categories}} );
  $self->{documents}  = AI::Categorizer::ObjectSet->new( @{$self->{documents}}  );

  if ($self->{load}) {
    my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
    $self->load(%$args);
    delete $self->{load};
  }
  return $self;
}

# features([$new_features])
#
# Combined getter/setter.  With an argument, stores it and (if it is true)
# trims every document's features down to that set.  Returns the stored
# feature vector if there is one; otherwise builds one by adding up the
# features of every document, caches it and returns it.
sub features {
  my $self = shift;

  if (@_) {
    $self->{features} = shift;
    $self->trim_doc_features if $self->{features};
  }
  return $self->{features} if $self->{features};

  # Create a feature vector encompassing the whole set of documents
  my $v = $self->create_delayed_object('features');
  foreach my $document ($self->documents) {
    $v->add( $document->features );
  }
  return $self->{features} = $v;
}

# categories()
#
# In list context returns the members of the category set; in scalar
# context returns its size.
sub categories {
  my $c = $_[0]->{categories};
  return wantarray ? $c->members : $c->size;
}

# documents()
#
# In list context returns the members of the document set; in scalar
# context returns its size.
sub documents {
  my $d = $_[0]->{documents};
  return wantarray ? $d->members : $d->size;
}

# document($name)
#
# Looks up a single document in the document set by name.
sub document {
  my ($self, $name) = @_;
  return $self->{documents}->retrieve($name);
}

# Read-only accessors.
sub feature_selector { $_[0]->{feature_selector} }
sub scan_first       { $_[0]->{scan_first} }

# verbose([$level])
#
# Combined getter/setter for the verbosity setting.
sub verbose {
  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

# trim_doc_features()
#
# Replaces each document's features with their intersection with the
# knowledge set's feature vector.
sub trim_doc_features {
  my ($self) = @_;

  foreach my $doc ($self->documents) {
    $doc->features( $doc->features->intersection($self->features) );
  }
}

# prog_bar($collection)
#
# Returns a code ref to be called once per processed document:
#   - a no-op when not verbose;
#   - a sub printing '.' to STDERR when Time::Progress cannot be loaded;
#   - otherwise a sub that prints a Time::Progress report to STDERR on
#     every 25th call.
sub prog_bar {
  my ($self, $collection) = @_;

  return sub {} unless $self->verbose;

  # Time::Progress is optional; probe for it with a block eval rather than
  # a string eval.
  return sub { print STDERR '.' } unless eval { require Time::Progress; 1 };

  my $count = $collection->can('count_documents') ? $collection->count_documents : 0;

  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);

  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}

# A little utility method for several other methods like scan_stats(),
# load(), read(), etc.  Returns $args->{collection} if given, otherwise
# creates a new collection object from the arguments.
sub _make_collection {
  my ($self, $args) = @_;
  return $args->{collection} || $self->create_delayed_object('collection', %$args);
}

# scan_stats(%args)
#
# Walks a collection and returns a hash ref of statistics.
# Should determine:
#  - number of documents
#  - number of categories
#  - avg. number of categories per document (whole corpus)
#  - avg. number of tokens per document (whole corpus)
#  - avg. number of types per document (whole corpus)
#  - number of documents, tokens, & types for each category
#  - "category skew index" (% variance?) by num. documents, tokens, and types
#
# For an empty collection the per-document averages are reported as 0
# instead of dying with a division by zero.  A "*_skew_by_category" entry
# is omitted when there are no categories or when the corresponding
# per-category mean is zero, since the skew is undefined in those cases.
sub scan_stats {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  my %stats = (
    document_count                 => 0,
    token_count                    => 0,
    type_count                     => 0,
    category_count_with_duplicates => 0,
    categories                     => {},
  );

  while (my $doc = $collection->next) {
    $pb->();

    # "+=" imposes scalar context on categories() -- presumably yielding
    # the number of categories of the document; verify against Document.
    $stats{category_count_with_duplicates} += $doc->categories;

    my ($sum, $length) = ($doc->features->sum, $doc->features->length);
    $stats{document_count}++;
    $stats{token_count} += $sum;
    $stats{type_count}  += $length;

    foreach my $cat ($doc->categories) {
      $stats{categories}{$cat->name}{document_count}++;
      $stats{categories}{$cat->name}{token_count} += $sum;
      $stats{categories}{$cat->name}{type_count}  += $length;
    }
  }

  # Terminate the progress display on the stream it was written to.
  print STDERR "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };

  $stats{category_count} = @cats;

  # Guard the per-document averages against an empty collection.
  my $doc_count = $stats{document_count};
  $stats{categories_per_document} = $doc_count ? $stats{category_count_with_duplicates} / $doc_count : 0;
  $stats{tokens_per_document}     = $doc_count ? $stats{token_count} / $doc_count : 0;
  $stats{types_per_document}      = $doc_count ? $stats{type_count}  / $doc_count : 0;

  foreach my $thing ('type', 'token', 'document') {
    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
      ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );

    next unless @cats;

    # The skew is the standard deviation divided by the mean; it is
    # undefined (and would divide by zero) when the mean is zero.
    my $mean = $stats{"${thing}s_per_category"};
    next unless $mean;

    # Compute the skews
    my $ssum = 0;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $mean) ** 2;
    }
    $stats{"${thing}_skew_by_category"} = sqrt($ssum / @cats) / $mean;
  }

  return \%stats;
}

# load(%args)
#
# Reads a collection into the knowledge set, using one of three strategies:
#   - $self->{features_kept} true: read everything, then select features;
#   - $self->{scan_first} true: scan the collection for features, rewind
#     it, then read it;
#   - otherwise: just read the data with no feature reduction.
sub load {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  if ($self->{features_kept}) {
    # Read the whole thing in, then reduce
    $self->read( collection => $c );
    $self->select_features;

  } elsif ($self->{scan_first}) {
    # Figure out the feature set first, then read data in
    $self->scan_features( collection => $c );
    $c->rewind;
    $self->read( collection => $c );

  } else {
    # Don't do any feature reduction, just read the data
    $self->read( collection => $c );
  }
}

# read(%args)
#
# Adds every document of a collection to the knowledge set, driving the
# progress callback once per document.
# NOTE(review): add_document() is not defined in the visible part of this
# file -- confirm that it is provided elsewhere.
sub read {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  while (my $doc = $collection->next) {
    $pb->();
    $self->add_document($doc);
  }

  # Terminate the progress display on the stream it was written to.
  print STDERR "\n" if $self->verbose;
}

# finish()
#
# Applies feature weighting exactly once; later calls are no-ops.
sub finish {
  my $self = shift;
  return if $self->{finished}++;
  $self->weigh_features;
}

# weigh_features()
#
# Re-weights the features of every document in three stages, selected by
# single-character codes:
#   term_weighting:       'x'/'t' none, 'l' 1 + log(tf),
#                         'n' 0.5 + 0.5 * tf / max_tf, 'b' 1 or 0
#   collection_weighting: 'x' none, 'f' log(N/n), 'p' log(N/n - 1)
#   normalize_weighting:  'x' none, 'c' calls normalize() on the features
# where N is the number of documents and n the document frequency of the
# term.  Dies on an unrecognized code.
#
# The feature hashes returned by as_hash() are modified in place.
#
# This could be made more efficient by figuring out an execution
# plan in advance
sub weigh_features {
  my $self = shift;

  if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
    # Nothing to do
  } elsif ( $self->{term_weighting} eq 'l' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      # log() of a non-positive number is fatal in Perl; an absent term
      # (frequency 0) simply keeps weight 0.
      $_ = ($_ > 0 ? 1 + log($_) : 0) foreach values %$f;
    }
  } elsif ( $self->{term_weighting} eq 'n' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      my $max_tf = AI::Categorizer::Util::max(values %$f);
      # Nothing to scale (and nothing to divide by) when the document has
      # no non-zero term frequency.
      next unless $max_tf;
      $_ = 0.5 + 0.5 * $_ / $max_tf foreach values %$f;
    }
  } elsif ( $self->{term_weighting} eq 'b' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $_ = $_ ? 1 : 0 foreach values %$f;
    }
  } else {
    die "term_weighting must be one of 'x', 't', 'l', 'b', or 'n'";
  }

  if ($self->{collection_weighting} eq 'x') {
    # Nothing to do
  } elsif ($self->{collection_weighting} =~ /^(f|p)$/) {
    my $subtrahend = ($self->{collection_weighting} eq 'f' ? 0 : 1);
    my $num_docs = $self->documents;   # scalar context: number of documents
    $self->document_frequency('foo');  # Initialize $self->{doc_freq_vector}
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      foreach my $term (keys %$f) {
        my $ratio = $num_docs / $self->{doc_freq_vector}{$term} - $subtrahend;
        # Under 'p' a term found in every document gives a ratio of 0, and
        # log(0) is fatal.  Such a term gets weight 0, matching what 'f'
        # yields for it (log(1) == 0).
        $f->{$term} *= ($ratio > 0 ? log($ratio) : 0);
      }
    }
  } else {
    die "collection_weighting must be one of 'x', 'f', or 'p'";
  }

  if ( $self->{normalize_weighting} eq 'x' ) {
    # Nothing to do
  } elsif ( $self->{normalize_weighting} eq 'c' ) {
    $_->features->normalize foreach $self->documents;
  } else {
    die "normalize_weighting must be one of 'x' or 'c'";
  }
}

# document_frequency($term)
#
# Returns the document-frequency entry for $term, or 0 if the term is
# unknown.  The frequency table is built on first use by adding up the
# boolean feature hashes of all documents, and is cached in
# $self->{doc_freq_vector}.  Dies if the table has to be built while the
# knowledge set contains no documents.
sub document_frequency {
  my ($self, $term) = @_;

  unless (exists $self->{doc_freq_vector}) {
    die "No corpus has been scanned for features" unless $self->documents;

    my $doc_freq = $self->create_delayed_object('features', features => {});
    foreach my $doc ($self->documents) {
      $doc_freq->add( $doc->features->as_boolean_hash );
    }
    $self->{doc_freq_vector} = $doc_freq->as_hash;
  }

  return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
}

# scan_features(%args)
#
# Asks the feature selector to scan a collection, registers the result as
# the 'use_features' parameter for documents and collections created
# later, and returns it.
sub scan_features {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  my $pb = $self->prog_bar($c);
  my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );

  $self->delayed_object_params('document',   use_features => $ranked_features);
  $self->delayed_object_params('collection', use_features => $ranked_features);
  return $ranked_features;
}

# select_features()
#
# Asks the feature selector to choose features from this knowledge set
# and installs the result via features().
sub select_features {
  my $self = shift;

  my $f = $self->feature_selector->select_features(knowledge_set => $self);
  $self->features($f);
}

# A module must return a true value when loaded.
1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -