📄 documentwriter.pm
字号:
package Plucene::Index::DocumentWriter;

=head1 NAME

Plucene::Index::DocumentWriter - the document writer

=head1 SYNOPSIS

    my $writer = Plucene::Index::DocumentWriter
        ->new($directory, $analyser, $max_field_length);

    $writer->add_document($segment, $doc);

=head1 DESCRIPTION

This is the document writer class.

=head2 METHODS

=cut

use strict;
use warnings;

use File::Slurp;
use Plucene::Index::FieldInfos;
use Plucene::Index::FieldsWriter;
use Plucene::Index::Term;
use Plucene::Index::TermInfo;
use Plucene::Index::TermInfosWriter;
use Plucene::Search::Similarity;
use Plucene::Store::OutputStream;
use IO::Scalar;

=head2 new

    my $writer = Plucene::Index::DocumentWriter
        ->new($directory, $analyser, $max_field_length);

This will create a new Plucene::Index::DocumentWriter object with the passed
in arguments.

=cut

sub new {
    # The analyser is deliberately not unpacked into "$a": that name is the
    # sort comparison variable and should never be shadowed by a lexical.
    my ($class, $directory, $analyzer, $max_field_length) = @_;
    return bless {
        directory        => $directory,
        analyzer         => $analyzer,
        max_field_length => $max_field_length,
        postings         => {},
    }, $class;
}

=head2 add_document

    $writer->add_document($segment, $doc);

Writes the given document into the given segment of the writer's directory:
the field infos (C<$segment.fnm>), the stored fields, the term postings
(C<$segment.frq> and C<$segment.prx>, plus the term infos) and one norm file
per indexed field (C<$segment.fN>, where N is the field number).

=cut

sub add_document {
    my ($self, $segment, $doc) = @_;

    # Field names first: every later step refers to fields by number.
    my $fi = Plucene::Index::FieldInfos->new();
    $fi->add($doc);
    $fi->write("$self->{directory}/$segment.fnm");
    $self->{field_infos} = $fi;

    # Stored field values.
    my $fw =
        Plucene::Index::FieldsWriter->new($self->{directory}, $segment, $fi);
    $fw->add_document($doc);

    # Per-document state is reset so the writer object can be reused.
    $self->{postings}      = {};
    $self->{field_lengths} = [];
    $self->_invert_document($doc);

    # Postings are written ordered by field name, then by term text.
    my @postings = sort {
               $a->{term}->{field} cmp $b->{term}->{field}
            || $a->{term}->{text} cmp $b->{term}->{text}
    } values %{ $self->{postings} };

    $self->_write_postings($segment, @postings);
    $self->_write_norms($doc, $segment);
    return;
}

# Walk every indexed field of $doc and record one position per term in
# $self->{postings}. The running position for each field number is kept in
# $self->{field_lengths}, so repeated fields with the same name continue
# counting where the previous one stopped.
sub _invert_document {
    my ($self, $doc) = @_;
    for my $field (grep $_->is_indexed, $doc->fields) {
        my $name = $field->name;
        my $fn   = $self->{field_infos}->field_number($name);

        # Start from 0 rather than undef so that a field which produces no
        # tokens still ends up with a numeric length.
        my $pos = $self->{field_lengths}->[$fn] || 0;

        if (!$field->is_tokenized) {
            # Untokenized fields are indexed as one term: the whole string.
            $self->_add_position($name, $field->string, $pos++);
        }
        else {
            my $reader = $field->reader
                || IO::Scalar->new(\$field->{string});
            my $stream = $self->{analyzer}
                ->tokenstream({ field => $name, reader => $reader });
            while (my $t = $stream->next) {
                $self->_add_position($name, $t->text, $pos++);
                last if $pos > $self->{max_field_length};
            }
        }
        $self->{field_lengths}->[$fn] = $pos;
    }
    return;
}

# Record that term $text occurs in field $field at position $pos. Postings
# are keyed by "$field\0$text"; an existing posting gets the position
# appended and its frequency bumped, otherwise a new posting is created.
sub _add_position {
    my ($self, $field, $text, $pos) = @_;
    my $key = "$field\0$text";

    my $ti = $self->{postings}->{$key};
    if ($ti) {
        $ti->{positions}->[ $ti->freq ] = $pos;
        $ti->{freq}++;
        return;
    }

    $self->{postings}->{$key} = Plucene::Index::Posting->new({
        term      => Plucene::Index::Term->new({ field => $field, text => $text }),
        positions => [$pos],
        freq      => 1,
    });
    return;
}

# Write the (already sorted) postings for $segment: term infos through a
# TermInfosWriter, frequencies to "$segment.frq" and positions to
# "$segment.prx", both as BER-compressed integers (pack template "w").
sub _write_postings {
    my ($self, $segment, @postings) = @_;
    my (@freqs, @proxs);
    my $tis = Plucene::Index::TermInfosWriter->new($self->{directory}, $segment,
        $self->{field_infos});
    my $ti = Plucene::Index::TermInfo->new();

    for my $posting (@postings) {
        # NOTE(review): the pointers are the number of values queued so far,
        # not byte offsets into the packed file - confirm this is what the
        # reading side expects.
        $ti->doc_freq(1);
        $ti->freq_pointer(scalar @freqs);
        $ti->prox_pointer(scalar @proxs);
        $tis->add($posting->term, $ti);

        # A frequency of 1 is stored as the single value 1; any other
        # frequency is stored as 0 followed by the frequency.
        my $f = $posting->freq;
        push @freqs, ($f == 1) ? 1 : (0, $f);

        # Positions are stored as deltas from the previous position.
        my $last_pos  = 0;
        my $positions = $posting->positions;
        for my $j (0 .. $f - 1) {
            my $pos = $positions->[$j] || 0;
            push @proxs, $pos - $last_pos;
            $last_pos = $pos;
        }
    }

    # The packed data is binary: force raw mode so no newline translation
    # can alter bytes on platforms that distinguish text and binary files.
    write_file("$self->{directory}/$segment.frq",
        { binmode => ':raw' }, pack('(w)*', @freqs));
    write_file("$self->{directory}/$segment.prx",
        { binmode => ':raw' }, pack('(w)*', @proxs));
    $tis->break_ref;
    return;
}

# Write one norm file per indexed field of $doc, named "$segment.fN" where N
# is the field number. The norm is computed by Plucene::Search::Similarity
# from the field length recorded by _invert_document and stored as a single
# character.
sub _write_norms {
    my ($self, $doc, $segment) = @_;
    for my $field (grep $_->is_indexed, $doc->fields) {
        my $fn = $self->{field_infos}->field_number($field->name);
        if ($fn < 0) {
            warn "Couldn't find field @{[ $field->name ]} in list [ @{[ map $_->name, $self->{field_infos}->fields]}]";

            # Without a valid field number there is no sensible file name
            # or length to use, so skip instead of writing "$segment.f-1".
            next;
        }
        my $norm =
            Plucene::Store::OutputStream->new("$self->{directory}/$segment.f$fn");
        my $val      = $self->{field_lengths}[$fn];
        my $norm_val = Plucene::Search::Similarity->norm($val);
        $norm->print(chr($norm_val));
    }
    return;
}

# A posting: one term together with its frequency and position list within
# the document being written.
package Plucene::Index::Posting;

use base 'Class::Accessor::Fast';

__PACKAGE__->mk_accessors(qw( term freq positions ));

1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -