
knowledgeset.pm

AI::Categorizer is a framework for automatic text categorization. It consists of a collection of Perl modules. The module below, AI::Categorizer::KnowledgeSet, manages a set of documents and categories, feature selection, and TF-IDF-style term weighting.
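A minimal usage sketch for this class, assuming a corpus readable by the default AI::Categorizer::Collection::Files collection; the corpus path is a placeholder, and 'lfc' is just one combination of the weighting letters the constructor accepts (see the source below).

use strict;
use warnings;
use AI::Categorizer::KnowledgeSet;

# The three-letter tfidf_weighting shortcut is split by new() into
# term/collection/normalize weighting: 'l' = 1 + log(tf),
# 'f' = log(N/df), 'c' = cosine normalization.
my $ks = AI::Categorizer::KnowledgeSet->new(
  tfidf_weighting => 'lfc',
  verbose         => 1,
);

# Read documents from disk (the path is hypothetical), then apply the
# configured weighting once every document has been added.
$ks->load(path => '/path/to/corpus');
$ks->finish;

# In scalar context, documents() and categories() return counts.
printf "%d documents, %d categories\n",
  scalar($ks->documents), scalar($ks->categories);

# One extra pass over the corpus for summary statistics: token/type
# counts per document and per category, plus a per-category skew index.
my $stats = $ks->scan_stats(path => '/path/to/corpus');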
package AI::Categorizer::KnowledgeSet;

use strict;

use Class::Container;
use AI::Categorizer::Storable;
use base qw(Class::Container AI::Categorizer::Storable);

use Params::Validate qw(:types);
use AI::Categorizer::ObjectSet;
use AI::Categorizer::Document;
use AI::Categorizer::Category;
use AI::Categorizer::FeatureVector;
use AI::Categorizer::Util;
use Carp qw(croak);

__PACKAGE__->valid_params
  (
   categories => {
     type => ARRAYREF,
     default => [],
     callbacks => {
       'all are Category objects' =>
         sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Category'), @{$_[0]} },
     },
   },
   documents => {
     type => ARRAYREF,
     default => [],
     callbacks => {
       'all are Document objects' =>
         sub { ! grep !UNIVERSAL::isa($_, 'AI::Categorizer::Document'), @{$_[0]} },
     },
   },
   scan_first => {
     type => BOOLEAN,
     default => 1,
   },
   feature_selector => {
     isa => 'AI::Categorizer::FeatureSelector',
   },
   tfidf_weighting => {
     type => SCALAR,
     optional => 1,
   },
   term_weighting => {
     type => SCALAR,
     default => 'x',
   },
   collection_weighting => {
     type => SCALAR,
     default => 'x',
   },
   normalize_weighting => {
     type => SCALAR,
     default => 'x',
   },
   verbose => {
     type => SCALAR,
     default => 0,
   },
  );

__PACKAGE__->contained_objects
  (
   document   => { delayed => 1, class => 'AI::Categorizer::Document' },
   category   => { delayed => 1, class => 'AI::Categorizer::Category' },
   collection => { delayed => 1, class => 'AI::Categorizer::Collection::Files' },
   features   => { delayed => 1, class => 'AI::Categorizer::FeatureVector' },
   feature_selector => 'AI::Categorizer::FeatureSelector::DocFrequency',
  );

sub new {
  my ($pkg, %args) = @_;

  # Shortcuts
  if ($args{tfidf_weighting}) {
    @args{'term_weighting', 'collection_weighting', 'normalize_weighting'}
      = split '', $args{tfidf_weighting};
    delete $args{tfidf_weighting};
  }

  my $self = $pkg->SUPER::new(%args);

  # Convert to AI::Categorizer::ObjectSet sets
  $self->{categories} = AI::Categorizer::ObjectSet->new( @{$self->{categories}} );
  $self->{documents}  = AI::Categorizer::ObjectSet->new( @{$self->{documents}} );

  if ($self->{load}) {
    my $args = ref($self->{load}) ? $self->{load} : { path => $self->{load} };
    $self->load(%$args);
    delete $self->{load};
  }
  return $self;
}

sub features {
  my $self = shift;

  if (@_) {
    $self->{features} = shift;
    $self->trim_doc_features if $self->{features};
  }
  return $self->{features} if $self->{features};

  # Create a feature vector encompassing the whole set of documents
  my $v = $self->create_delayed_object('features');
  foreach my $document ($self->documents) {
    $v->add( $document->features );
  }
  return $self->{features} = $v;
}

sub categories {
  my $c = $_[0]->{categories};
  return wantarray ? $c->members : $c->size;
}

sub documents {
  my $d = $_[0]->{documents};
  return wantarray ? $d->members : $d->size;
}

sub document {
  my ($self, $name) = @_;
  return $self->{documents}->retrieve($name);
}

sub feature_selector { $_[0]->{feature_selector} }
sub scan_first       { $_[0]->{scan_first} }

sub verbose {
  my $self = shift;
  $self->{verbose} = shift if @_;
  return $self->{verbose};
}

sub trim_doc_features {
  my ($self) = @_;

  foreach my $doc ($self->documents) {
    $doc->features( $doc->features->intersection($self->features) );
  }
}

sub prog_bar {
  my ($self, $collection) = @_;

  return sub {} unless $self->verbose;
  return sub { print STDERR '.' } unless eval "use Time::Progress; 1";

  my $count = $collection->can('count_documents') ? $collection->count_documents : 0;

  my $pb = 'Time::Progress'->new;
  $pb->attr(max => $count);
  my $i = 0;
  return sub {
    $i++;
    return if $i % 25;
    print STDERR $pb->report("%50b %p ($i/$count)\r", $i);
  };
}

# A little utility method for several other methods like scan_stats(),
# load(), read(), etc.
sub _make_collection {
  my ($self, $args) = @_;
  return $args->{collection} || $self->create_delayed_object('collection', %$args);
}

sub scan_stats {
  # Should determine:
  #  - number of documents
  #  - number of categories
  #  - avg. number of categories per document (whole corpus)
  #  - avg. number of tokens per document (whole corpus)
  #  - avg. number of types per document (whole corpus)
  #  - number of documents, tokens, & types for each category
  #  - "category skew index" (% variance?) by num. documents, tokens, and types

  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  my %stats;

  while (my $doc = $collection->next) {
    $pb->();
    $stats{category_count_with_duplicates} += $doc->categories;

    my ($sum, $length) = ($doc->features->sum, $doc->features->length);
    $stats{document_count}++;
    $stats{token_count} += $sum;
    $stats{type_count}  += $length;

    foreach my $cat ($doc->categories) {
      # warn $doc->name, ": ", $cat->name, "\n";
      $stats{categories}{$cat->name}{document_count}++;
      $stats{categories}{$cat->name}{token_count} += $sum;
      $stats{categories}{$cat->name}{type_count}  += $length;
    }
  }
  print "\n" if $self->verbose;

  my @cats = keys %{ $stats{categories} };

  $stats{category_count}          = @cats;
  $stats{categories_per_document} = $stats{category_count_with_duplicates} / $stats{document_count};
  $stats{tokens_per_document}     = $stats{token_count} / $stats{document_count};
  $stats{types_per_document}      = $stats{type_count}  / $stats{document_count};

  foreach my $thing ('type', 'token', 'document') {
    $stats{"${thing}s_per_category"} = AI::Categorizer::Util::average
      ( map { $stats{categories}{$_}{"${thing}_count"} } @cats );

    next unless @cats;

    # Compute the skews
    my $ssum;
    foreach my $cat (@cats) {
      $ssum += ($stats{categories}{$cat}{"${thing}_count"} - $stats{"${thing}s_per_category"}) ** 2;
    }
    $stats{"${thing}_skew_by_category"} = sqrt($ssum/@cats) / $stats{"${thing}s_per_category"};
  }

  return \%stats;
}

sub load {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  if ($self->{features_kept}) {
    # Read the whole thing in, then reduce
    $self->read( collection => $c );
    $self->select_features;

  } elsif ($self->{scan_first}) {
    # Figure out the feature set first, then read data in
    $self->scan_features( collection => $c );
    $c->rewind;
    $self->read( collection => $c );

  } else {
    # Don't do any feature reduction, just read the data
    $self->read( collection => $c );
  }
}

sub read {
  my ($self, %args) = @_;
  my $collection = $self->_make_collection(\%args);
  my $pb = $self->prog_bar($collection);

  while (my $doc = $collection->next) {
    $pb->();
    $self->add_document($doc);
  }
  print "\n" if $self->verbose;
}

sub finish {
  my $self = shift;
  return if $self->{finished}++;
  $self->weigh_features;
}

sub weigh_features {
  # This could be made more efficient by figuring out an execution
  # plan in advance

  my $self = shift;

  if ( $self->{term_weighting} =~ /^(t|x)$/ ) {
    # Nothing to do

  } elsif ( $self->{term_weighting} eq 'l' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $_ = 1 + log($_) foreach values %$f;
    }

  } elsif ( $self->{term_weighting} eq 'n' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      my $max_tf = AI::Categorizer::Util::max values %$f;
      $_ = 0.5 + 0.5 * $_ / $max_tf foreach values %$f;
    }

  } elsif ( $self->{term_weighting} eq 'b' ) {
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $_ = $_ ? 1 : 0 foreach values %$f;
    }

  } else {
    die "term_weighting must be one of 'x', 't', 'l', 'b', or 'n'";
  }

  if ($self->{collection_weighting} eq 'x') {
    # Nothing to do

  } elsif ($self->{collection_weighting} =~ /^(f|p)$/) {
    my $subtrahend = ($1 eq 'f' ? 0 : 1);
    my $num_docs = $self->documents;
    $self->document_frequency('foo');  # Initialize
    foreach my $doc ($self->documents) {
      my $f = $doc->features->as_hash;
      $f->{$_} *= log($num_docs / $self->{doc_freq_vector}{$_} - $subtrahend) foreach keys %$f;
    }

  } else {
    die "collection_weighting must be one of 'x', 'f', or 'p'";
  }

  if ( $self->{normalize_weighting} eq 'x' ) {
    # Nothing to do
  } elsif ( $self->{normalize_weighting} eq 'c' ) {
    $_->features->normalize foreach $self->documents;
  } else {
    die "normalize_weighting must be one of 'x' or 'c'";
  }
}

sub document_frequency {
  my ($self, $term) = @_;

  unless (exists $self->{doc_freq_vector}) {
    die "No corpus has been scanned for features" unless $self->documents;

    my $doc_freq = $self->create_delayed_object('features', features => {});
    foreach my $doc ($self->documents) {
      $doc_freq->add( $doc->features->as_boolean_hash );
    }
    $self->{doc_freq_vector} = $doc_freq->as_hash;
  }

  return exists $self->{doc_freq_vector}{$term} ? $self->{doc_freq_vector}{$term} : 0;
}

sub scan_features {
  my ($self, %args) = @_;
  my $c = $self->_make_collection(\%args);

  my $pb = $self->prog_bar($c);
  my $ranked_features = $self->{feature_selector}->scan_features( collection => $c, prog_bar => $pb );

  $self->delayed_object_params('document',   use_features => $ranked_features);
  $self->delayed_object_params('collection', use_features => $ranked_features);
  return $ranked_features;
}

sub select_features {
  my $self = shift;

  my $f = $self->feature_selector->select_features(knowledge_set => $self);
  $self->features($f);
}
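For reference, here is a standalone sketch of the arithmetic the 'lfc' path in weigh_features() applies to a single term; the counts are made up for illustration and do not come from the module.

use strict;
use warnings;

# tf = raw count of the term in one document, df = number of documents
# containing the term, N = total documents in the knowledge set.
my ($tf, $df, $n_docs) = (3, 10, 100);

my $term_w = 1 + log($tf);        # 'l' branch: 1 + ln(tf)
my $coll_w = log($n_docs / $df);  # 'f' branch: ln(N/df), subtrahend 0
my $weight = $term_w * $coll_w;   # ~4.83 for these counts

printf "weighted value: %.4f\n", $weight;

# The 'c' step would then divide each document vector by its Euclidean
# norm, via $doc->features->normalize.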
