📄 phrasescorer.pm
字号:
package KinoSearch::Search::PhraseScorer;use strict;use warnings;use KinoSearch::Util::ToolSet;use base qw( KinoSearch::Search::Scorer );BEGIN { __PACKAGE__->init_instance_vars( # constructor params weight => undef, term_docs => undef, phrase_offsets => undef, norms_reader => undef, slop => 0, );}our %instance_vars;sub new { my $either = shift; confess kerror() unless verify_args( \%instance_vars, @_ ); my %args = ( %instance_vars, @_ ); my $self = $either->SUPER::new; $self->_init_child; # set/derive some member vars $self->_set_norms( $args{norms_reader}->get_bytes ); $self->set_similarity( $args{similarity} ); $self->_set_weight_value( $args{weight}->get_value ); confess("Sloppy phrase matching not yet implemented") unless $args{slop} == 0; # TODO -- enable slop. $self->_set_slop( $args{slop} ); # sort terms by ascending frequency confess("positions count doesn't match term count") unless $#{ $args{term_docs} } == $#{ $args{phrase_offsets} }; my @by_size = sort { $a->[0]->get_doc_freq <=> $b->[0]->get_doc_freq } map { [ $args{term_docs}[$_], $args{phrase_offsets}[$_] ] } 0 .. $#{ $args{term_docs} }; my @term_docs = map { $_->[0] } @by_size; my @phrase_offsets = map { $_->[1] } @by_size; $self->_init_elements( \@term_docs, \@phrase_offsets ); return $self;}1;__END____XS__MODULE = KinoSearch PACKAGE = KinoSearch::Search::PhraseScorervoid_init_child(scorer) Scorer *scorer;PPCODE: Kino_PhraseScorer_init_child(scorer);void_init_elements(scorer, term_docs_av, phrase_offsets_av) Scorer *scorer; AV *term_docs_av; AV *phrase_offsets_av;PREINIT: PhraseScorerChild *child; I32 i; SV **sv_ptr; IV tmp;PPCODE:{ child = (PhraseScorerChild*)scorer->child; SvREFCNT_inc(term_docs_av); SvREFCNT_dec(child->term_docs_av); child->term_docs_av = term_docs_av; child->num_elements = av_len(term_docs_av) + 1; Kino_New(0, child->term_docs, child->num_elements, TermDocs*); Kino_New(0, child->phrase_offsets, child->num_elements, U32); /* create an array of TermDocs* */ for(i = 0; i < child->num_elements; i++) { sv_ptr = av_fetch(term_docs_av, i, 0); tmp = SvIV((SV*)SvRV( *sv_ptr )); child->term_docs[i] = INT2PTR(TermDocs*, tmp); sv_ptr = av_fetch(phrase_offsets_av, i, 0); child->phrase_offsets[i] = SvIV( *sv_ptr ); }}SV*_phrase_scorer_set_or_get(scorer, ...) Scorer *scorer;ALIAS: _set_slop = 1 _get_slop = 2 _set_weight_value = 3 _get_weight_value = 4 _set_norms = 5 _get_norms = 6CODE:{ PhraseScorerChild *child = (PhraseScorerChild*)scorer->child; KINO_START_SET_OR_GET_SWITCH case 1: child->slop = SvIV( ST(1) ); /* fall through */ case 2: RETVAL = newSViv(child->slop); break; case 3: child->weight_value = SvNV( ST(1) ); /* fall through */ case 4: RETVAL = newSVnv(child->weight_value); break; case 5: SvREFCNT_dec(child->norms_sv); child->norms_sv = newSVsv( ST(1) ); { SV* bytes_deref_sv; bytes_deref_sv = SvRV(child->norms_sv); if (SvPOK(bytes_deref_sv)) { child->norms = (unsigned char*)SvPVX(bytes_deref_sv); } else { child->norms = NULL; } } /* fall through */ case 6: RETVAL = newSVsv(child->norms_sv); break; KINO_END_SET_OR_GET_SWITCH}OUTPUT: RETVALvoidDESTROY(scorer) Scorer *scorer;PPCODE: Kino_PhraseScorer_destroy(scorer);__H__#ifndef H_KINO_PHRASE_SCORER#define H_KINO_PHRASE_SCORER 1#include "EXTERN.h"#include "perl.h"#include "XSUB.h"#include "KinoSearchIndexTermDocs.h"#include "KinoSearchSearchScorer.h"#include "KinoSearchUtilMemManager.h"typedef struct phrasescorerchild { U32 doc; U32 slop; U32 num_elements; TermDocs **term_docs; U32 *phrase_offsets; float phrase_freq; float weight_value; U32 first_time; unsigned char *norms; SV *anchor_set; float (*calc_phrase_freq)(Scorer*); SV *norms_sv; AV *term_docs_av;} PhraseScorerChild;void Kino_PhraseScorer_init_child(Scorer*);bool Kino_PhraseScorer_next(Scorer*);float Kino_PhraseScorer_calc_phrase_freq(Scorer*);U32 Kino_PhraseScorer_doc(Scorer*);float Kino_PhraseScorer_score(Scorer*);void Kino_PhraseScorer_destroy(Scorer*);#endif /* include guard */__C__#include "KinoSearchSearchPhraseScorer.h"voidKino_PhraseScorer_init_child(Scorer *scorer) { PhraseScorerChild *child; /* allocate */ Kino_New(0, child, 1, PhraseScorerChild); scorer->child = child; child->anchor_set = newSV(0); /* init */ child->doc = 0xFFFFFFFF; child->slop = 0; child->first_time = 1; child->phrase_freq = 0.0; child->norms = NULL; child->phrase_offsets = NULL; child->term_docs_av = (AV*)&PL_sv_undef; child->norms_sv = &PL_sv_undef;; /* define abstract methods */ scorer->next = Kino_PhraseScorer_next; scorer->score = Kino_PhraseScorer_score; scorer->doc = Kino_PhraseScorer_doc; child->calc_phrase_freq = Kino_PhraseScorer_calc_phrase_freq;}boolKino_PhraseScorer_next(Scorer *scorer) { PhraseScorerChild *child; TermDocs **term_docs; U32 candidate; U32 i; child = (PhraseScorerChild*)scorer->child; term_docs = child->term_docs; child->phrase_freq = 0.0; child->doc = 0xFFFFFFFF; if (child->first_time) { child->first_time = 0; /* advance all except the first term_docs */ for (i = 1; i < child->num_elements; i++) { if ( !term_docs[i]->next(term_docs[i]) ) return 0; } } /* seed the search */ if ( !term_docs[0]->next(term_docs[0]) ) return 0; candidate = term_docs[0]->get_doc(term_docs[0]); /* find a doc which contains all the terms */ FIND_COMMON_DOC: while (1) { for (i = 0; i < child->num_elements; i++) { U32 thisdoc = term_docs[i]->get_doc(term_docs[i]); if (thisdoc > candidate) candidate = thisdoc; } for (i = 0; i < child->num_elements; i++) { U32 thisdoc = term_docs[i]->get_doc(term_docs[i]); if (thisdoc < candidate) { if (!term_docs[i]->skip_to(term_docs[i], candidate)) return 0; } } for (i = 0; i < child->num_elements; i++) { if (term_docs[i]->get_doc(term_docs[i]) != candidate) { goto FIND_COMMON_DOC; } } break; /* success! */ } /* if the terms don't actually form a phrase, skip to the next doc */ child->phrase_freq = child->calc_phrase_freq(scorer); if (child->phrase_freq == 0.0) return scorer->next(scorer); /* success! */ child->doc = candidate; return 1;}floatKino_PhraseScorer_calc_phrase_freq(Scorer *scorer) { PhraseScorerChild *child; TermDocs **term_docs; U32 *anchors; U32 *anchors_start; U32 *anchors_end; U32 *new_anchors; U32 *candidates; U32 *candidates_end; U32 phrase_offset; U32 i; STRLEN len; child = (PhraseScorerChild*)scorer->child; term_docs = child->term_docs; /* create an anchor set */ sv_setsv( child->anchor_set, term_docs[0]->get_positions(term_docs[0]) ); anchors_start = (U32*)SvPVX(child->anchor_set); anchors = anchors_start; anchors_end = (U32*)SvEND(child->anchor_set); phrase_offset = child->phrase_offsets[0]; while(anchors < anchors_end) { *anchors++ -= phrase_offset; } /* match the positions of other terms against the anchor set */ for (i = 1; i < child->num_elements; i++) { phrase_offset = child->phrase_offsets[i]; anchors = anchors_start; new_anchors = anchors_start; anchors_end = (U32*)SvEND(child->anchor_set); new_anchors = anchors; candidates = (U32*)SvPVX( term_docs[i]->get_positions(term_docs[i]) ); candidates_end = (U32*)SvEND( term_docs[i]->get_positions(term_docs[i]) ); while (anchors < anchors_end) { U32 target; /* Discard positions that occur too early in the field to match as * a part of the phrase. For example, if the field begins with * "The ants go marching one by one", that initial "the" cannot * match as the second term in a phrase search for * "fight the power". */ target = phrase_offset; while (candidates < candidates_end && *candidates < target) { candidates++; } if (candidates == candidates_end) break; /* Discard partial matches which seemed promising earlier but * which fail on this go-round. */ target = *candidates - phrase_offset; while (anchors < anchors_end && *anchors < target) { anchors++; } if (anchors == anchors_end) break; /* Blast past any positions for the current term which are too low * for the partial phrase matched in earlier iters. */ target = *anchors + phrase_offset; while (candidates < candidates_end && *candidates < target) { candidates++; } if (candidates == candidates_end) break; /* Does the current position fall into the slot? */ if (*candidates == target) { /* The anchor has made it through another elimination round. */ *new_anchors = *anchors; new_anchors++; } anchors++; } /* winnow down the size of the anchor set */ len = (char*)new_anchors - (char*)anchors_start; SvCUR_set(child->anchor_set, len); } /* the number of anchors left is the phrase freq */ len = SvCUR(child->anchor_set); return (float) len / sizeof(U32);}U32Kino_PhraseScorer_doc(Scorer *scorer) { PhraseScorerChild* child = (PhraseScorerChild*)scorer->child; return child->doc;}floatKino_PhraseScorer_score(Scorer *scorer) { PhraseScorerChild* child; float score; unsigned char norm; child = (PhraseScorerChild*)scorer->child; /* calculate raw score */ score = scorer->sim->tf(scorer->sim, child->phrase_freq) * child->weight_value; /* normalize */ norm = child->norms[ child->doc ]; score *= scorer->sim->norm_decoder[norm]; return score;}voidKino_PhraseScorer_destroy(Scorer *scorer) { PhraseScorerChild *child; child = (PhraseScorerChild*)scorer->child; Kino_Safefree(child->term_docs); Kino_Safefree(child->phrase_offsets); SvREFCNT_dec(child->norms_sv); SvREFCNT_dec((SV*)child->term_docs_av); SvREFCNT_dec(child->anchor_set); Kino_Safefree(child); Kino_Scorer_destroy(scorer);}__POD__=begin devdocs=head1 NAMEKinoSearch::Search::PhraseScorer - scorer for PhraseQuery=head1 DESCRIPTION Score phrases.=head1 COPYRIGHTCopyright 2005-2007 Marvin Humphrey=head1 LICENSE, DISCLAIMER, BUGS, etc.See L<KinoSearch|KinoSearch> version 0.163.=end devdocs=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -