📄 benchmarkingindexer.pm
字号:
package BenchmarkingIndexer;use strict;use warnings;use Carp;use Config;use File::Spec::Functions qw( catfile catdir );use POSIX qw( uname );sub new { my $either = shift; my $class = ref($either) || $either; return bless { docs => undef, increment => undef, store => undef, engine => undef, version => undef, index_dir => undef, corpus_dir => 'extracted_corpus', article_filepaths => undef, @_, }, $class;}sub init_indexer { confess "abstract method" }sub build_index { confess "abstract method" }sub delayed_init { my $self = shift; my $article_filepaths = $self->{article_filepaths} = $self->build_file_list; $self->{docs} = @$article_filepaths unless defined $self->{docs}; $self->{increment} = $self->{docs} + 1 unless defined $self->{increment};}# Return a lexically sorted list of all article files from all subdirs.sub build_file_list { my $self = shift; my $corpus_dir = $self->{corpus_dir}; my @article_filepaths; opendir CORPUS_DIR, $corpus_dir or confess "Can't opendir '$corpus_dir': $!"; my @article_dir_names = grep {/articles/} readdir CORPUS_DIR; for my $article_dir_name (@article_dir_names) { my $article_dir = catdir( $corpus_dir, $article_dir_name ); opendir ARTICLE_DIR, $article_dir or die "Can't opendir '$article_dir': $!"; push @article_filepaths, map { catfile( $article_dir, $_ ) } grep {m/^article\d+\.txt$/} readdir ARTICLE_DIR; } @article_filepaths = sort @article_filepaths; $self->{article_filepaths} = \@article_filepaths;}# Print out stats for one run.sub print_interim_report { my ( $self, %args ) = @_; printf( "%-3d Secs: %.2f Docs: %-4d\n", @args{qw( rep secs count )} );}sub start_report { # start the output print '-' x 60 . "\n";}# Print out aggregate statssub print_final_report { my ( $self, $times ) = @_; # produce mean and truncated mean my @sorted_times = sort @$times; my $num_to_chop = int( @sorted_times >> 2 ); my $mean = 0; my $trunc_mean = 0; my $num_kept = 0; for ( my $i = 0; $i < @sorted_times; $i++ ) { $mean += $sorted_times[$i]; # discard fastest 25% and slowest 25% of runs next if $i < $num_to_chop; next if $i > ( $#sorted_times - $num_to_chop ); $trunc_mean += $sorted_times[$i]; $num_kept++; } $mean /= @sorted_times; $trunc_mean /= $num_kept; my $num_discarded = @sorted_times - $num_kept; $mean = sprintf("%.2f", $mean); $trunc_mean = sprintf("%.2f", $trunc_mean); # get some info about the system my $thread_support = $Config{usethreads} ? "yes" : "no"; my @uname_info = (uname)[0, 2, 4]; print <<END_REPORT;------------------------------------------------------------$self->{engine} $self->{version} Perl $Config{version}Thread support: $thread_support@uname_infoMean: $mean secs Truncated mean ($num_kept kept, $num_discarded discarded): $trunc_mean secs------------------------------------------------------------END_REPORT}package BenchmarkingIndexer::KinoSearch;use strict;use warnings;use base qw( BenchmarkingIndexer );use Time::HiRes qw( gettimeofday );sub new { my $class = shift; my $self = $class->SUPER::new(@_); require KinoSearch; require KinoSearch::InvIndexer; require KinoSearch::Analysis::Tokenizer; $self->{index_dir} = 'kinosearch_index'; $self->{engine} = 'KinoSearch'; $self->{version} = $KinoSearch::VERSION; return $self;}sub init_indexer { my ( $self, $count ) = @_; my $create = $count ? 0 : 1; # spec out the invindexer my $analyzer = KinoSearch::Analysis::Tokenizer->new( token_re => qr/\S+/, ); my $invindexer = KinoSearch::InvIndexer->new( invindex => $self->{index_dir}, create => $create, analyzer => $analyzer, ); $invindexer->spec_field( name => 'body', stored => $self->{store}, vectorized => $self->{store}, ); $invindexer->spec_field( name => 'title', vectorized => 0, ); return $invindexer;}# Build an index, stopping at $max docs if $max > 0.sub build_index { my $self = shift; $self->delayed_init; my ( $max, $increment, $article_filepaths ) = @{$self}{qw( docs increment article_filepaths )}; # start timer my $start = gettimeofday(); my $invindexer = $self->init_indexer(0); my $count = 0; while ($count < $max) { for my $article_filepath (@$article_filepaths) { # the title is the first line, the body is the rest open( my $article_fh, '<', $article_filepath ) or die "Can't open file '$article_filepath'"; my $title = <$article_fh>; my $body = do { local $/; <$article_fh> }; # add content to index my $doc = $invindexer->new_doc; $doc->set_value( title => $title ); $doc->set_value( body => $body ); $invindexer->add_doc($doc); # bail if we've reached spec'd number of docs $count++; last if $count >= $max; if ( $count % $increment == 0 and $count ) { $invindexer->finish; undef $invindexer; $invindexer = $self->init_indexer($count); } } } # finish index $invindexer->finish( optimize => 1 ); # return elapsed seconds my $end = gettimeofday(); my $secs = $end - $start; return ( $count, $secs );}package BenchmarkingIndexer::Plucene;use strict;use warnings;use base qw( BenchmarkingIndexer );use Time::HiRes qw( gettimeofday );sub new { my $class = shift; my $self = $class->SUPER::new(@_); require Plucene; require Plucene::Document; require Plucene::Document::Field; require Plucene::Index::Writer; require Plucene::Analysis::WhitespaceAnalyzer; $self->{index_dir} = 'plucene_index'; $self->{engine} = 'Plucene'; $self->{version} = $Plucene::VERSION; return $self;}sub init_indexer { my ( $self, $count ) = @_; my $create = $count ? 0 : 1; my $writer = Plucene::Index::Writer->new( $self->{index_dir}, Plucene::Analysis::WhitespaceAnalyzer->new(), $create ); $writer->set_mergefactor(1000); return $writer;}# Build an index, stopping at $max docs if $max > 0.sub build_index { my $self = shift; $self->delayed_init; my ( $max, $increment, $article_filepaths ) = @{$self}{qw( docs increment article_filepaths )}; # cause text to be stored if spec'd my $field_constructor = $self->{store} ? 'Text' : 'UnStored'; # start timer my $start = gettimeofday(); my $writer = $self->init_indexer(0); my $count = 0; while ($count < $max) { for my $article_filepath (@$article_filepaths) { # the title is the first line, the body is the rest open( my $article_fh, '<', $article_filepath ) or die "Can't open file '$article_filepath'"; my $title = <$article_fh>; my $body = do { local $/; <$article_fh> }; # add content to index my $doc = Plucene::Document->new; $doc->add( Plucene::Document::Field->Text( title => $title ) ); $doc->add( Plucene::Document::Field->$field_constructor( body => $body ) ); $writer->add_document($doc); # bail if we've reached spec'd number of docs $count++; last if ( $count >= $max ); if ( $count % $increment == 0 and $count ) { undef $writer; $writer = $self->init_indexer($count); } } } # finish index $writer->optimize; # return elapsed seconds my $end = gettimeofday(); my $secs = $end - $start; return ( $count, $secs );}1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -