# segwriter.pm
package KinoSearch::Index::SegWriter;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::Class );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params / members
        invindex   => undef,
        seg_name   => undef,
        finfos     => undef,
        field_sims => undef,
        # members
        norm_outstreams => undef,
        fields_writer   => undef,
        postings_writer => undef,
        doc_count       => 0,
    );
    __PACKAGE__->ready_get(qw( seg_name doc_count ));
}

use KinoSearch::Analysis::TokenBatch;
use KinoSearch::Index::FieldsWriter;
use KinoSearch::Index::PostingsWriter;
use KinoSearch::Index::CompoundFileWriter;
use KinoSearch::Index::IndexFileNames
    qw( @COMPOUND_EXTENSIONS SORTFILE_EXTENSION );

# Set up one norms OutStream per indexed field, plus the FieldsWriter and
# PostingsWriter that this SegWriter feeds.
sub init_instance {
    my $self = shift;
    my ( $invindex, $seg_name, $finfos )
        = @{$self}{ 'invindex', 'seg_name', 'finfos' };

    # init norms: one ".fN" file per indexed field, indexed by field number
    my $norm_outstreams = $self->{norm_outstreams} = [];
    my @indexed_field_nums = map { $_->get_field_num }
        grep { $_->get_indexed } $finfos->get_infos;
    for my $field_num (@indexed_field_nums) {
        $norm_outstreams->[$field_num]
            = $invindex->open_outstream("$seg_name.f$field_num");
    }

    # init FieldsWriter
    $self->{fields_writer} = KinoSearch::Index::FieldsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );

    # init PostingsWriter
    $self->{postings_writer} = KinoSearch::Index::PostingsWriter->new(
        invindex => $invindex,
        seg_name => $seg_name,
    );
}

# Add a document to the segment: invert each indexed field, write its norm
# byte, feed postings to the PostingsWriter, and store the document.
sub add_doc {
    my ( $self, $doc ) = @_;
    my $norm_outstreams = $self->{norm_outstreams};
    my $field_sims      = $self->{field_sims};
    my $doc_boost       = $doc->get_boost;

    for my $indexed_field ( grep { $_->get_indexed } $doc->get_fields ) {
        my $field_name  = $indexed_field->get_name;
        my $token_batch = KinoSearch::Analysis::TokenBatch->new;

        # if the field has content, put it in the TokenBatch
        if ( $indexed_field->get_value_len ) {
            $token_batch->append( $indexed_field->get_value, 0,
                $indexed_field->get_value_len );
        }

        # analyze the field
        if ( $indexed_field->get_analyzed ) {
            $token_batch
                = $indexed_field->get_analyzer()->analyze($token_batch);
        }

        # invert the doc
        $token_batch->build_posting_list( $self->{doc_count},
            $indexed_field->get_field_num );

        # prepare to store the term vector, if the field is vectorized
        if ( $indexed_field->get_vectorized and $indexed_field->get_stored ) {
            $indexed_field->set_tv_string( $token_batch->get_tv_string );
        }

        # encode a norm into a byte, write it to an outstream
        my $norm_val
            = $doc_boost
            * $indexed_field->get_boost
            * $field_sims->{$field_name}
            ->lengthnorm( $token_batch->get_size );
        my $outstream = $norm_outstreams->[ $indexed_field->get_field_num ];
        $outstream->lu_write( 'a',
            $field_sims->{$field_name}->encode_norm($norm_val) );

        # feed PostingsWriter
        $self->{postings_writer}->add_postings( $token_batch->get_postings );
    }

    # store fields
    $self->{fields_writer}->add_doc($doc);

    $self->{doc_count}++;
}

# Bulk-add all the surviving documents of another segment to this one,
# remapping doc numbers around deletions and field numbers across schemas.
sub add_segment {
    my ( $self, $seg_reader ) = @_;

    # prepare to bulk add
    my $deldocs = $seg_reader->get_deldocs;
    my $doc_map = $deldocs->generate_doc_map( $seg_reader->max_doc,
        $self->{doc_count} );
    my $field_num_map
        = $self->{finfos}->generate_field_num_map( $seg_reader->get_finfos );

    # bulk add the slab of documents to the various writers
    $self->_merge_norms( $seg_reader, $doc_map );
    $self->{fields_writer}
        ->add_segment( $seg_reader, $doc_map, $field_num_map );
    $self->{postings_writer}->add_segment( $seg_reader, $doc_map );

    $self->{doc_count} += $seg_reader->num_docs;
}

# Bulk write norms.
sub _merge_norms {
    my ( $self, $seg_reader, $doc_map ) = @_;
    my $norm_outstreams = $self->{norm_outstreams};
    my $field_sims      = $self->{field_sims};
    my @indexed_fields = grep { $_->get_indexed } $self->{finfos}->get_infos;

    for my $field (@indexed_fields) {
        my $field_name   = $field->get_name;
        my $outstream    = $norm_outstreams->[ $field->get_field_num ];
        my $norms_reader = $seg_reader->norms_reader($field_name);

        # if the field was indexed before, copy the norms
        if ( defined $norms_reader ) {
            # XS helper below: skips deleted docs per $doc_map
            _write_remapped_norms( $outstream,
                $doc_map, $norms_reader->get_bytes );
        }
        else {
            # the field isn't in the input segment, so write a default
            my $zeronorm = $field_sims->{$field_name}->lengthnorm(0);
            my $num_docs = $seg_reader->num_docs;
            my $normstring
                = $field_sims->{$field_name}->encode_norm($zeronorm)
                x $num_docs;
            $outstream->lu_write( "a$num_docs", $normstring );
        }
    }
}

# Finish writing the segment: flush all sub-writers, consolidate the
# segment's files into a compound ".cfs" file, and clean up temporaries.
sub finish {
    my $self = shift;
    my ( $invindex, $seg_name ) = @{$self}{ 'invindex', 'seg_name' };

    # write Term Dictionary, positions.
    $self->{postings_writer}->write_postings;

    # write FieldInfos
    my $finfos_outstream = $invindex->open_outstream("$seg_name.fnm");
    $self->{finfos}->write_infos($finfos_outstream);
    $finfos_outstream->close;

    # close down all the writers, so we can open the files they've finished.
    $self->{postings_writer}->finish;
    $self->{fields_writer}->finish;
    for ( @{ $self->{norm_outstreams} } ) {
        $_->close if defined;
    }

    # consolidate compound file - if we actually added any docs
    my @compound_files = map {"$seg_name.$_"} @COMPOUND_EXTENSIONS;
    if ( $self->{doc_count} ) {
        my $compound_file_writer
            = KinoSearch::Index::CompoundFileWriter->new(
            invindex => $invindex,
            filename => "$seg_name.tmp",
            );
        push @compound_files, map { "$seg_name.f" . $_->get_field_num }
            grep { $_->get_indexed } $self->{finfos}->get_infos;
        $compound_file_writer->add_file($_) for @compound_files;
        $compound_file_writer->finish;
        $invindex->rename_file( "$seg_name.tmp", "$seg_name.cfs" );
    }

    # delete files that are no longer needed
    $invindex->delete_file($_) for @compound_files;
    my $sort_file_name = "$seg_name" . SORTFILE_EXTENSION;
    $invindex->delete_file($sort_file_name)
        if $invindex->file_exists($sort_file_name);
}

1;

__END__

__XS__

MODULE = KinoSearch    PACKAGE = KinoSearch::Index::SegWriter

void
_write_remapped_norms(outstream, doc_map_ref, norms_ref)
    OutStream *outstream;
    SV        *doc_map_ref;
    SV        *norms_ref;
PPCODE:
    Kino_SegWriter_write_remapped_norms(outstream, doc_map_ref, norms_ref);

__H__

#ifndef H_KINOSEARCH_SEG_WRITER
#define H_KINOSEARCH_SEG_WRITER 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchStoreOutStream.h"
#include "KinoSearchUtilCarp.h"

void
Kino_SegWriter_write_remapped_norms(OutStream*, SV*, SV*);

#endif /* include guard */

__C__

#include "KinoSearchIndexSegWriter.h"

/* Copy one norm byte per surviving doc; a doc_map entry of -1 marks a
 * deleted doc whose norm is skipped. */
void
Kino_SegWriter_write_remapped_norms(OutStream *outstream, SV *doc_map_ref,
                                    SV* norms_ref) {
    SV     *norms_sv, *doc_map_sv;
    I32    *doc_map, *doc_map_end;
    char   *norms;
    STRLEN  doc_map_len, norms_len;

    /* extract doc map and norms arrays */
    doc_map_sv  = SvRV(doc_map_ref);
    doc_map     = (I32*)SvPV(doc_map_sv, doc_map_len);
    doc_map_end = (I32*)SvEND(doc_map_sv);
    norms_sv    = SvRV(norms_ref);
    norms       = SvPV(norms_sv, norms_len);
    /* doc_map holds one I32 per doc; norms holds one byte per doc */
    if (doc_map_len != norms_len * sizeof(I32))
        Kino_confess("Mismatched doc_map and norms");

    /* write a norm for each non-deleted doc */
    while (doc_map < doc_map_end) {
        if (*doc_map != -1) {
            outstream->write_byte(outstream, *norms);
        }
        doc_map++;
        norms++;
    }
}

__POD__

=begin devdocs

=head1 NAME

KinoSearch::Index::SegWriter - write one segment of an invindex

=head1 DESCRIPTION

SegWriter is a conduit through which information fed to InvIndexer passes on
its way to low-level writers such as FieldsWriter and TermInfosWriter.

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.

=end devdocs
=cut