⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 invindexer.pm

📁 外国人写的Perl搜索引擎程序
💻 PM
📖 第 1 页 / 共 2 页
字号:
package KinoSearch::InvIndexer;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );

# Values stored in $self->{state}.
#   UNINITIALIZED - fields may still be spec'd; no SegWriter exists yet.
#   INITIALIZED   - _delayed_init has run; a SegWriter exists.
#   FINISHED      - checked by _delayed_init, which refuses to run again.
use constant UNINITIALIZED => 0;
use constant INITIALIZED   => 1;
use constant FINISHED      => 2;

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor args / members
        create   => undef,
        invindex => undef,
        analyzer => undef,
        # members
        reader       => undef,
        analyzers    => undef,
        sinfos       => undef,
        finfos       => undef,
        doc_template => undef,
        frozen_doc   => undef,
        similarity   => undef,
        field_sims   => undef,
        seg_writer   => undef,
        write_lock   => undef,
        state        => UNINITIALIZED,
    );
}

use Storable qw( freeze thaw );
use File::Spec::Functions qw( catfile tmpdir );
use KinoSearch::Document::Doc;
use KinoSearch::Document::Field;
use KinoSearch::Analysis::Analyzer;
use KinoSearch::Store::FSInvIndex;
use KinoSearch::Index::FieldInfos;
use KinoSearch::Index::FieldsReader;
use KinoSearch::Index::IndexReader;
use KinoSearch::Index::SegInfos;
use KinoSearch::Index::SegWriter;
use KinoSearch::Index::IndexFileNames
    qw( WRITE_LOCK_NAME    COMMIT_LOCK_NAME
    WRITE_LOCK_TIMEOUT COMMIT_LOCK_TIMEOUT );
use KinoSearch::Search::Similarity;

# First-stage initialization of an InvIndexer.
#
# Reads $self->{invindex}, $self->{create} and $self->{analyzer}.
# 'invindex' may be either an object that isa KinoSearch::Store::InvIndex or
# a defined non-object value, which is passed as 'path' to
# KinoSearch::Store::FSInvIndex->new.
#
# Croaks if 'invindex' is not supplied, if the write lock cannot be
# obtained, or if reading/writing the SegInfos fails.
#
# NOTE(review): presumably invoked by the KinoSearch::Util::Class
# constructor -- the caller is not visible in this file; confirm.
sub init_instance {
    my $self = shift;
    $self->{analyzers}  = {};
    $self->{field_sims} = {};

    # use a no-op Analyzer if not supplied
    $self->{analyzer} ||= KinoSearch::Analysis::Analyzer->new;

    # create a few members
    $self->{similarity}   = KinoSearch::Search::Similarity->new;
    $self->{sinfos}       = KinoSearch::Index::SegInfos->new;
    $self->{doc_template} = KinoSearch::Document::Doc->new;

    # confirm or create an InvIndex object
    my $invindex;
    if ( blessed( $self->{invindex} )
        and $self->{invindex}->isa('KinoSearch::Store::InvIndex') )
    {
        $invindex = $self->{invindex};

        # an explicit 'create' argument wins; otherwise inherit the setting
        # from the supplied InvIndex object
        $self->{create} = $invindex->get_create
            unless defined $self->{create};
    }
    elsif ( defined $self->{invindex} ) {
        $invindex = $self->{invindex} = KinoSearch::Store::FSInvIndex->new(
            create => $self->{create},
            path   => $self->{invindex},
        );
    }
    else {
        croak("Required parameter 'invindex' not supplied");
    }

    # get a write lock for this invindex.
    my $write_lock = $invindex->make_lock(
        lock_name => WRITE_LOCK_NAME,
        timeout   => WRITE_LOCK_TIMEOUT,
    );
    if ( $write_lock->obtain ) {
        # only assign if successful, otherwise DESTROY unlocks (bad!)
        $self->{write_lock} = $write_lock;
    }
    else {
        croak( "invindex locked: " . $write_lock->get_lock_name );
    }

    # read/write SegInfos
    eval {
        $invindex->run_while_locked(
            lock_name => COMMIT_LOCK_NAME,
            timeout   => COMMIT_LOCK_TIMEOUT,
            do_body   => sub {
                $self->{create}
                    ? $self->{sinfos}->write_infos($invindex)
                    : $self->{sinfos}->read_infos($invindex);
            },
        );
    };
    if ($@) {
        $self->{create}
            ? croak("failed to create invindex: $@")
            : croak("failed to open existing invindex: $@");
    }

    # get a finfos and maybe a reader
    if ( $self->{create} ) {
        $self->{finfos} = KinoSearch::Index::FieldInfos->new;
    }
    else {
        $self->{reader}
            = KinoSearch::Index::IndexReader->new( invindex => $invindex );
        $self->{finfos} = $self->{reader}->generate_field_infos;
    }

    # more initialization is coming after fields are spec'd...
}

# Second-stage initialization, run once the set of fields is known.
#
# Confesses if the state is FINISHED or already INITIALIZED; otherwise moves
# the state to INITIALIZED, freezes the doc template into
# $self->{frozen_doc}, gives every field in finfos a similarity, and creates
# $self->{seg_writer} for a newly named segment.
sub _delayed_init {
    my $self = shift;
    my ( $invindex, $finfos, $field_sims )
        = @{$self}{qw( invindex finfos field_sims )};

    confess("finish has been called")
        if $self->{state} == FINISHED;
    confess("internal error: already initialized")
        if $self->{state} == INITIALIZED;
    $self->{state} = INITIALIZED;

    # create a cloning template
    my $doc = $self->{doc_template};
    for my $field ( $doc->get_fields ) {
        $field->set_field_num( $finfos->get_field_num( $field->get_name ) );
    }
    $self->{frozen_doc} = freeze($doc);

    # set sim for each field
    # (fields given a similarity via set_similarity keep it; the rest fall
    # back to the main similarity)
    my $main_sim = $self->{similarity};
    for my $finfo ( $finfos->get_infos ) {
        $field_sims->{ $finfo->get_name } ||= $main_sim;
    }

    # name a new segment and create a SegWriter
    my $out_seg_name = $self->_new_seg_name;
    $self->{seg_writer} = KinoSearch::Index::SegWriter->new(
        invindex   => $invindex,
        seg_name   => $out_seg_name,
        finfos     => $finfos->clone,
        field_sims => $field_sims,
    );
}

# Define a field for documents produced by this indexer.
#
# Accepts either a single blessed object (used as the Field) or a list of
# arguments which is handed to KinoSearch::Document::Field->new.
#
# Croaks unless the state is still UNINITIALIZED, and re-croaks any error
# raised while constructing the Field.
sub spec_field {
    my $self = shift;

    # don't allow new fields to be spec'd once the seg is in motion
    croak("Too late to spec field (new_doc has been called)")
        unless $self->{state} == UNINITIALIZED;

    # detect or define a Field object
    my $field;
    if ( blessed( $_[0] ) ) {
        $field = shift;
    }
    else {
        eval { $field = KinoSearch::Document::Field->new(@_) };
        croak $@ if $@;
    }

    # cache fnm_bits and fdt_bits
    $field->set_fnm_bits(
        KinoSearch::Index::FieldInfos->encode_fnm_bits($field) );
    $field->set_fdt_bits(
        KinoSearch::Index::FieldsReader->encode_fdt_bits($field) );

    # establish which analyzer will be used against the field
    $self->{analyzers}{ $field->get_name }
        = ( $field->get_analyzer || $self->{analyzer} );

    # don't copy the analyzer into the template, so that it can be overridden
    $field->set_analyzer(undef);

    # add the field to the finfos and the template.
    $self->{finfos}->add_field($field);
    $self->{doc_template}->add_field($field);
}

# Return a fresh document thawed from the frozen template.
#
# Runs _delayed_init first when the state is not INITIALIZED.
sub new_doc {
    my $self = shift;
    $self->_delayed_init unless $self->{state} == INITIALIZED;
    return thaw( $self->{frozen_doc} );
}

# Set a similarity object.
#
#   $invindexer->set_similarity( $field_name, $sim );  # for one field
#   $invindexer->set_similarity($sim);                 # main similarity
#
# The form is chosen purely by argument count: exactly three elements in @_
# (invocant included) selects the per-field form.
sub set_similarity {
    if ( @_ == 3 ) {
        my ( $self, $field_name, $sim ) = @_;
        $self->{field_sims}{$field_name} = $sim;
    }
    else {
        $_[0]->{similarity} = $_[1];
    }
}

# Add a document to the output segment.
#
# Any analyzed field which has no analyzer of its own is given the analyzer
# recorded for that field name by spec_field.  The doc is then handed to
# $self->{seg_writer}.
sub add_doc {
    my ( $self, $doc ) = @_;

    # assign analyzers
    for my $field ( $doc->get_fields ) {
        if ( $field->get_analyzed ) {
            next if $field->get_analyzer;
            my $fieldname = $field->get_name;
            $field->set_analyzer( $self->{analyzers}{$fieldname} );
        }
    }

    # add doc to output segment
    $self->{seg_writer}->add_doc($doc);
}

# Merge the contents of other invindexes into this one.
#
# Each argument is either a KinoSearch::Store::InvIndex object or a value
# which is passed as 'path' to KinoSearch::Store::FSInvIndex->new.
#
# Confesses if the state is already INITIALIZED.
sub add_invindexes {
    my ( $self, @invindexes ) = @_;
    confess("Can't call add_invindexes after new_doc")
        if $self->{state} == INITIALIZED;

    # verify or obtain InvIndex objects
    # ($_ aliases the elements of @invindexes, so this replaces in place)
    for (@invindexes) {
        if ( !a_isa_b( $_, 'KinoSearch::Store::InvIndex' ) ) {
            $_ = KinoSearch::Store::FSInvIndex->new( path => $_ );
        }
    }

    # get a reader for each invindex
    my @readers
        = map { KinoSearch::Index::IndexReader->new( invindex => $_ ) }
        @invindexes;

    # merge finfos and init
    for my $reader (@readers) {
        $self->{finfos}->consolidate( $reader->get_finfos );
    }
    $self->_delayed_init;

    # add all segments in each of the supplied invindexes
    my $seg_writer = $self->{seg_writer};
    for my $reader (@readers) {
        $seg_writer->add_segment($_) for $reader->segreaders_to_merge('all');
    }
}

# Ask the reader to delete documents by term.
#
# Confesses unless $term is a KinoSearch::Index::Term.  Returns without
# doing anything when there is no reader (init_instance only creates one
# when 'create' is false).  Runs _delayed_init first if needed.
sub delete_docs_by_term {
    my ( $self, $term ) = @_;
    confess("Not a KinoSearch::Index::Term")
        unless a_isa_b( $term, 'KinoSearch::Index::Term' );
    return unless $self->{reader};
    $self->_delayed_init unless $self->{state} == INITIALIZED;
    $self->{reader}->delete_docs_by_term($term);
}

# Defaults (and the set of labels checked by verify_args) for finish().
our %finish_defaults = ( optimize => 0, );

# Complete the indexing session.
#
# Takes labeled args; the only one defined in %finish_defaults is
# 'optimize' (default 0).  Confesses if verify_args rejects the arguments.
sub finish {
    my $self = shift;
    confess kerror() unless verify_args( \%finish_defaults, @_ );
    my %args = ( %finish_defaults, @_ );

    # if no changes were made to the index, don't write anything
    if ( $self->{state} == UNINITIALIZED ) {
        if ( !$args{optimize} ) {
            return;
        }
        else {
            $self->_delayed_init;
        }
    }

    my ( $invindex, $sinfos, $seg_writer )
        = @{$self}{qw( invindex sinfos seg_writer )};

    # perform segment merging
    my @to_merge
        = $self->{reader}
        ? $self->{reader}->segreaders_to_merge( $args{optimize} )
        : ();
    $seg_writer->add_segment($_)                for @to_merge;
    $sinfos->delete_segment( $_->get_seg_name ) for @to_merge;

    # finish the segment
    $seg_writer->finish;

    # now that the seg is complete, write its info to the 'segments' file
    my $doc_count = $seg_writer->get_doc_count;
    if ($doc_count) {
        $sinfos->add_info(
            KinoSearch::Index::SegInfo->new(
                seg_name  => $seg_writer->get_seg_name,
                doc_count => $doc_count,
                invindex  => $invindex,
            )
        );
    }

    # NOTE(review): this listing ends here, part-way through finish(); the
    # remainder of the sub (and of the module) is not present in this file.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -