📄 invindexer.pm
字号:
package KinoSearch::InvIndexer;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );

# Lifecycle states stored in $self->{state}.
use constant UNINITIALIZED => 0;
use constant INITIALIZED   => 1;
use constant FINISHED      => 2;

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor args / members
        create   => undef,
        invindex => undef,
        analyzer => undef,

        # members
        reader       => undef,
        analyzers    => undef,
        sinfos       => undef,
        finfos       => undef,
        doc_template => undef,
        frozen_doc   => undef,
        similarity   => undef,
        field_sims   => undef,
        seg_writer   => undef,
        write_lock   => undef,
        state        => UNINITIALIZED,
    );
}

use Storable qw( freeze thaw );
use File::Spec::Functions qw( catfile tmpdir );

use KinoSearch::Document::Doc;
use KinoSearch::Document::Field;
use KinoSearch::Analysis::Analyzer;
use KinoSearch::Store::FSInvIndex;
use KinoSearch::Index::FieldInfos;
use KinoSearch::Index::FieldsReader;
use KinoSearch::Index::IndexReader;
use KinoSearch::Index::SegInfos;
use KinoSearch::Index::SegWriter;
use KinoSearch::Index::IndexFileNames qw(
    WRITE_LOCK_NAME
    COMMIT_LOCK_NAME
    WRITE_LOCK_TIMEOUT
    COMMIT_LOCK_TIMEOUT
);
use KinoSearch::Search::Similarity;

# init_instance
#
# First-stage initialization: fills in default members, resolves the
# 'invindex' argument to an InvIndex object, obtains the write lock, reads or
# writes the SegInfos under the commit lock, and sets up finfos (plus a
# reader when opening an existing invindex).
#
# Croaks if 'invindex' was not supplied, if the write lock cannot be
# obtained, or if reading/writing the SegInfos fails.
sub init_instance {
    my $self = shift;
    $self->{analyzers}  = {};
    $self->{field_sims} = {};

    # use a no-op Analyzer if not supplied
    $self->{analyzer} ||= KinoSearch::Analysis::Analyzer->new;

    # create a few members
    $self->{similarity}   = KinoSearch::Search::Similarity->new;
    $self->{sinfos}       = KinoSearch::Index::SegInfos->new;
    $self->{doc_template} = KinoSearch::Document::Doc->new;

    # confirm or create an InvIndex object
    my $invindex;
    if ( blessed( $self->{invindex} )
        and $self->{invindex}->isa('KinoSearch::Store::InvIndex') )
    {
        $invindex = $self->{invindex};

        # an explicit 'create' argument wins over the InvIndex's own setting
        $self->{create} = $invindex->get_create
            unless defined $self->{create};
    }
    elsif ( defined $self->{invindex} ) {
        # anything else that is defined is passed to FSInvIndex as a path
        $invindex = $self->{invindex} = KinoSearch::Store::FSInvIndex->new(
            create => $self->{create},
            path   => $self->{invindex},
        );
    }
    else {
        croak("Required parameter 'invindex' not supplied");
    }

    # get a write lock for this invindex.
    my $write_lock = $invindex->make_lock(
        lock_name => WRITE_LOCK_NAME,
        timeout   => WRITE_LOCK_TIMEOUT,
    );
    if ( $write_lock->obtain ) {
        # only assign if successful, otherwise DESTROY unlocks (bad!)
        $self->{write_lock} = $write_lock;
    }
    else {
        croak( "invindex locked: " . $write_lock->get_lock_name );
    }

    # read/write SegInfos
    eval {
        $invindex->run_while_locked(
            lock_name => COMMIT_LOCK_NAME,
            timeout   => COMMIT_LOCK_TIMEOUT,
            do_body   => sub {
                $self->{create}
                    ? $self->{sinfos}->write_infos($invindex)
                    : $self->{sinfos}->read_infos($invindex);
            },
        );
    };
    if ($@) {
        croak(
            $self->{create}
            ? "failed to create invindex: $@"
            : "failed to open existing invindex: $@"
        );
    }

    # get a finfos and maybe a reader
    if ( $self->{create} ) {
        $self->{finfos} = KinoSearch::Index::FieldInfos->new;
    }
    else {
        $self->{reader}
            = KinoSearch::Index::IndexReader->new( invindex => $invindex );
        $self->{finfos} = $self->{reader}->generate_field_infos;
    }

    # more initialization is coming after fields are spec'd...
}

# _delayed_init
#
# Second-stage initialization, run once the set of fields is settled.
# Moves state to INITIALIZED, assigns field numbers to the template doc's
# fields and freezes it, fills in a default similarity for every field that
# has none, and creates the SegWriter for a newly named segment.
#
# Confesses if finish has already been called or if the object is already
# initialized.
sub _delayed_init {
    my $self = shift;
    my ( $invindex, $finfos, $field_sims )
        = @{$self}{qw( invindex finfos field_sims )};

    confess("finish has been called")
        if $self->{state} == FINISHED;
    confess("internal error: already initialized")
        if $self->{state} == INITIALIZED;
    $self->{state} = INITIALIZED;

    # create a cloning template
    my $doc = $self->{doc_template};
    for my $field ( $doc->get_fields ) {
        $field->set_field_num( $finfos->get_field_num( $field->get_name ) );
    }
    $self->{frozen_doc} = freeze($doc);

    # set sim for each field
    my $main_sim = $self->{similarity};
    for my $finfo ( $finfos->get_infos ) {
        $field_sims->{ $finfo->get_name } ||= $main_sim;
    }

    # name a new segment and create a SegWriter
    my $out_seg_name = $self->_new_seg_name;
    $self->{seg_writer} = KinoSearch::Index::SegWriter->new(
        invindex   => $invindex,
        seg_name   => $out_seg_name,
        finfos     => $finfos->clone,
        field_sims => $field_sims,
    );
}

# spec_field
#
# Registers a field. Accepts either an already-blessed field object as the
# first argument, or a list of arguments that is handed to
# KinoSearch::Document::Field->new. Records which analyzer applies to the
# field (the field's own, else the InvIndexer's), then adds the field to both
# finfos and the document template.
#
# Croaks unless state is still UNINITIALIZED, or if Field->new dies.
sub spec_field {
    my $self = shift;

    # don't allow new fields to be spec'd once the seg is in motion
    croak("Too late to spec field (new_doc has been called)")
        unless $self->{state} == UNINITIALIZED;

    # detect or define a Field object
    my $field;
    if ( blessed( $_[0] ) ) {
        $field = shift;
    }
    else {
        eval { $field = KinoSearch::Document::Field->new(@_) };
        croak $@ if $@;
    }

    # cache fnm_bits and fdt_bits
    $field->set_fnm_bits(
        KinoSearch::Index::FieldInfos->encode_fnm_bits($field) );
    $field->set_fdt_bits(
        KinoSearch::Index::FieldsReader->encode_fdt_bits($field) );

    # establish which analyzer will be used against the field
    $self->{analyzers}{ $field->get_name }
        = ( $field->get_analyzer || $self->{analyzer} );

    # don't copy the analyzer into the template, so that it can be overridden
    $field->set_analyzer(undef);

    # add the field to the finfos and the template.
    $self->{finfos}->add_field($field);
    $self->{doc_template}->add_field($field);
}

# new_doc
#
# Returns a fresh copy of the document template by thawing the frozen
# template. Runs _delayed_init first whenever state is not INITIALIZED.
sub new_doc {
    my $self = shift;
    $self->_delayed_init unless $self->{state} == INITIALIZED;
    return thaw( $self->{frozen_doc} );
}

# set_similarity
#
# Called with ( $self, $field_name, $sim ): stores $sim for that one field.
# Called with any other argument count: stores the second argument as the
# main similarity.
sub set_similarity {
    if ( @_ == 3 ) {
        my ( $self, $field_name, $sim ) = @_;
        $self->{field_sims}{$field_name} = $sim;
    }
    else {
        $_[0]->{similarity} = $_[1];
    }
}

# add_doc
#
# Gives every analyzed field that has no analyzer of its own the analyzer
# recorded for its name by spec_field, then hands the document to the
# SegWriter.
sub add_doc {
    my ( $self, $doc ) = @_;

    # assign analyzers
    for my $field ( $doc->get_fields ) {
        if ( $field->get_analyzed ) {
            next if $field->get_analyzer;
            my $fieldname = $field->get_name;
            $field->set_analyzer( $self->{analyzers}{$fieldname} );
        }
    }

    # add doc to output segment
    $self->{seg_writer}->add_doc($doc);
}

# add_invindexes
#
# Merges the supplied invindexes into this one. Each argument is either an
# InvIndex object or a value passed to FSInvIndex->new as a path. Their field
# infos are consolidated into this object's finfos, _delayed_init is run, and
# every segment reader returned by segreaders_to_merge('all') is added to the
# SegWriter.
#
# Confesses if state is already INITIALIZED.
sub add_invindexes {
    my ( $self, @invindexes ) = @_;
    confess("Can't call add_invindexes after new_doc")
        if $self->{state} == INITIALIZED;

    # verify or obtain InvIndex objects
    for (@invindexes) {
        if ( !a_isa_b( $_, 'KinoSearch::Store::InvIndex' ) ) {
            $_ = KinoSearch::Store::FSInvIndex->new( path => $_ );
        }
    }

    # get a reader for each invindex
    my @readers
        = map { KinoSearch::Index::IndexReader->new( invindex => $_ ) }
        @invindexes;

    # merge finfos and init
    for my $reader (@readers) {
        $self->{finfos}->consolidate( $reader->get_finfos );
    }
    $self->_delayed_init;

    # add all segments in each of the supplied invindexes
    my $seg_writer = $self->{seg_writer};
    for my $reader (@readers) {
        $seg_writer->add_segment($_)
            for $reader->segreaders_to_merge('all');
    }
}

# delete_docs_by_term
#
# Forwards the deletion to the reader. Does nothing when there is no reader
# (init_instance only creates one when 'create' is false). Runs _delayed_init
# first if state is not INITIALIZED.
#
# Confesses if $term is not a KinoSearch::Index::Term.
sub delete_docs_by_term {
    my ( $self, $term ) = @_;
    confess("Not a KinoSearch::Index::Term")
        unless a_isa_b( $term, 'KinoSearch::Index::Term' );
    return unless $self->{reader};
    $self->_delayed_init unless $self->{state} == INITIALIZED;
    $self->{reader}->delete_docs_by_term($term);
}

# Defaults (and the set of accepted argument names) for finish().
our %finish_defaults = ( optimize => 0, );

# finish
#
# Takes labeled args; 'optimize' defaults to 0. Returns immediately when
# nothing has been initialized and optimize is false. Otherwise merges the
# segments chosen by the reader, finishes the SegWriter, and records the new
# segment in the SegInfos when it holds at least one document.
sub finish {
    my $self = shift;
    confess kerror() unless verify_args( \%finish_defaults, @_ );
    my %args = ( %finish_defaults, @_ );

    # if no changes were made to the index, don't write anything
    if ( $self->{state} == UNINITIALIZED ) {
        if ( !$args{optimize} ) {
            return;
        }
        else {
            $self->_delayed_init;
        }
    }

    my ( $invindex, $sinfos, $seg_writer )
        = @{$self}{qw( invindex sinfos seg_writer )};

    # perform segment merging
    my @to_merge
        = $self->{reader}
        ? $self->{reader}->segreaders_to_merge( $args{optimize} )
        : ();
    $seg_writer->add_segment($_) for @to_merge;
    $sinfos->delete_segment( $_->get_seg_name ) for @to_merge;

    # finish the segment
    $seg_writer->finish;

    # now that the seg is complete, write its info to the 'segments' file
    my $doc_count = $seg_writer->get_doc_count;
    if ($doc_count) {
        $sinfos->add_info(
            KinoSearch::Index::SegInfo->new(
                seg_name  => $seg_writer->get_seg_name,
                doc_count => $doc_count,
                invindex  => $invindex,
            )
        );
    }

    # NOTE(review): the available source is cut off at this point; the
    # remainder of finish() (and its closing brace) is not visible and has
    # deliberately not been reconstructed.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -