⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segtermdocs.pm

📁 外国人写的Perl搜索引擎程序
💻 PM
字号:
package KinoSearch::Index::SegTermDocs;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Index::TermDocs );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params
        reader => undef,
    );
}
our %instance_vars;

# Constructor.  Takes a single labeled parameter, `reader`, and copies from it
# everything the C-level child struct needs: the skip interval, private
# clones of the input streams, and the deleted-docs object.
sub new {
    my $self = shift->SUPER::new;
    confess kerror() unless verify_args( \%instance_vars, @_ );
    my %args = ( %instance_vars, @_ );
    my $reader = $args{reader};

    # allocate and initialize the C struct that backs this object
    _init_child($self);

    # dupe some stuff from the parent reader.
    $self->_set_reader( $reader );
    $self->_set_skip_interval( $reader->get_skip_interval );
    $self->_set_freq_stream( $reader->get_freq_stream()->clone_stream );

    # NOTE(review): the skip stream is a second clone of the *freq* stream.
    # This looks intentional: seek_tinfo computes skip_fileptr as
    # frq_fileptr + skip_offset, i.e. a position within the same stream.
    $self->_set_skip_stream( $reader->get_freq_stream()->clone_stream );
    $self->_set_prox_stream( $reader->get_prox_stream()->clone_stream );
    $self->_set_deldocs( $reader->get_deldocs );

    return $self;
}

# Position this object on the postings for $term.  An undefined $term is
# passed through to seek_tinfo as an undefined term info.
sub seek {
    my ( $self, $term ) = @_;
    my $tinfo =
        defined $term
        ? $self->_get_reader()->fetch_term_info($term)
        : undef;
    $self->seek_tinfo($tinfo);
}

# Close the three cloned streams held by this object.
sub close {
    my $self = shift;
    $self->_get_freq_stream()->close;
    $self->_get_prox_stream()->close;
    $self->_get_skip_stream()->close;
}

1;

__END__

__XS__

MODULE = KinoSearch    PACKAGE = KinoSearch::Index::SegTermDocs

void
_init_child(term_docs)
    TermDocs *term_docs;
PPCODE:
    Kino_SegTermDocs_init_child(term_docs);

SV*
_set_or_get(term_docs, ...)
    TermDocs *term_docs;
ALIAS:
    _set_count         = 1
    _get_count         = 2
    _set_freq_stream   = 3
    _get_freq_stream   = 4
    _set_prox_stream   = 5
    _get_prox_stream   = 6
    _set_skip_stream   = 7
    _get_skip_stream   = 8
    _set_deldocs       = 9
    _get_deldocs       = 10
    _set_reader        = 11
    _get_reader        = 12
    set_read_positions = 13
    get_read_positions = 14
    _set_skip_interval = 15
    _get_skip_interval = 16
CODE:
{
    /* Combined accessor: odd alias numbers are setters, even numbers are
     * getters.  Each setter falls through to its getter, so a setter returns
     * the value that was just stored.
     * NOTE(review): KINO_START_SET_OR_GET_SWITCH / KINO_END_SET_OR_GET_SWITCH
     * are defined elsewhere; presumably they open and close the switch on
     * the alias index -- confirm against the macro definition.
     */
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;

    KINO_START_SET_OR_GET_SWITCH

    case 1:  child->count = SvUV(ST(1));
             /* fall through */
    case 2:  RETVAL = newSVuv(child->count);
             break;

    case 3:  SvREFCNT_dec(child->freq_stream_sv);
             child->freq_stream_sv = newSVsv( ST(1) );
             Kino_extract_struct( child->freq_stream_sv, child->freq_stream,
                 InStream*, "KinoSearch::Store::InStream");
             /* fall through */
    case 4:  RETVAL = newSVsv(child->freq_stream_sv);
             break;

    case 5:  SvREFCNT_dec(child->prox_stream_sv);
             child->prox_stream_sv = newSVsv( ST(1) );
             Kino_extract_struct( child->prox_stream_sv, child->prox_stream,
                 InStream*, "KinoSearch::Store::InStream");
             /* fall through */
    case 6:  RETVAL = newSVsv(child->prox_stream_sv);
             break;

    case 7:  SvREFCNT_dec(child->skip_stream_sv);
             child->skip_stream_sv = newSVsv( ST(1) );
             Kino_extract_struct( child->skip_stream_sv, child->skip_stream,
                 InStream*, "KinoSearch::Store::InStream");
             /* fall through */
    case 8:  RETVAL = newSVsv(child->skip_stream_sv);
             break;

    case 9:  SvREFCNT_dec(child->deldocs_sv);
             child->deldocs_sv = newSVsv( ST(1) );
             Kino_extract_struct( child->deldocs_sv, child->deldocs,
                 BitVector*, "KinoSearch::Index::DelDocs" );
             /* fall through */
    case 10: RETVAL = newSVsv(child->deldocs_sv);
             break;

    case 11: /* Validate BEFORE releasing the old reader.  If the check
              * throws after the old SV has been released, reader_sv is left
              * pointing at an SV we no longer own and destroy() would
              * decrement its refcount a second time. */
             if (!sv_derived_from( ST(1), "KinoSearch::Index::IndexReader") )
                Kino_confess("not a KinoSearch::Index::IndexReader");
             SvREFCNT_dec(child->reader_sv);
             child->reader_sv = newSVsv( ST(1) );
             /* fall through */
    case 12: RETVAL = newSVsv(child->reader_sv);
             break;

    case 13: child->read_positions = SvTRUE( ST(1) ) ? 1 : 0;
             /* fall through */
    case 14: RETVAL = newSViv(child->read_positions);
             break;

    case 15: child->skip_interval = SvUV(ST(1));
             /* fall through */
    case 16: RETVAL = newSVuv(child->skip_interval);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL

__H__

#ifndef H_KINO_SEG_TERM_DOCS
#define H_KINO_SEG_TERM_DOCS 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchUtilBitVector.h"
#include "KinoSearchIndexTermDocs.h"
#include "KinoSearchIndexTermInfo.h"
#include "KinoSearchStoreInStream.h"
#include "KinoSearchUtilMemManager.h"

/* Per-object state for a SegTermDocs, hung off TermDocs->child. */
typedef struct segtermdocschild {
    U32        count;          /* postings consumed so far for this term */
    U32        doc_freq;       /* total postings for this term */
    U32        doc;            /* current doc number (sum of deltas) */
    U32        freq;           /* term frequency within the current doc */
    U32        skip_doc;       /* doc number of the last skip entry read */
    U32        skip_count;     /* skip entries read so far */
    U32        num_skips;      /* doc_freq / skip_interval */
    SV        *positions;      /* string SV holding packed U32 positions */
    U32        read_positions; /* if true, next() also loads positions */
    U32        skip_interval;
    InStream  *freq_stream;
    InStream  *prox_stream;
    InStream  *skip_stream;
    bool       have_skipped;   /* skip_stream already seeked for this term */
    double     frq_fileptr;
    double     prx_fileptr;
    double     skip_fileptr;
    BitVector *deldocs;        /* consulted to filter out deleted docs */
    /* Perl-level objects wrapping the structs above; a reference is held on
     * each and released in Kino_SegTermDocs_destroy. */
    SV        *freq_stream_sv;
    SV        *prox_stream_sv;
    SV        *skip_stream_sv;
    SV        *deldocs_sv;
    SV        *reader_sv;
} SegTermDocsChild;

void Kino_SegTermDocs_init_child(TermDocs*);
void Kino_SegTermDocs_set_doc_freq(TermDocs*, U32);
U32  Kino_SegTermDocs_get_doc_freq(TermDocs*);
U32  Kino_SegTermDocs_get_doc(TermDocs*);
U32  Kino_SegTermDocs_get_freq(TermDocs*);
SV*  Kino_SegTermDocs_get_positions(TermDocs*);
U32  Kino_SegTermDocs_bulk_read(TermDocs*, SV*, SV*, U32);
void Kino_SegTermDocs_seek_tinfo(TermDocs*, TermInfo*);
bool Kino_SegTermDocs_next(TermDocs*);
bool Kino_SegTermDocs_skip_to(TermDocs*, U32 target);
/* NOTE(review): no definition of the following function is visible in this
 * file -- confirm whether it is implemented elsewhere or is a leftover. */
bool Kino_SegTermDocs_skip_to_with_positions(TermDocs*);
void Kino_SegTermDocs_destroy(TermDocs*);

#endif /* include guard */

__C__

#include "KinoSearchIndexSegTermDocs.h"

static void
load_positions(TermDocs *term_docs);

/* Allocate the SegTermDocsChild for a TermDocs, install this class's
 * function pointers in the TermDocs, and set initial values for the fields
 * assigned below.  Fields not assigned here are left as Kino_New returned
 * them.
 */
void
Kino_SegTermDocs_init_child(TermDocs *term_docs) {
    SegTermDocsChild *child;

    Kino_New(1, child, 1, SegTermDocsChild);
    term_docs->child = child;

    child->doc_freq = KINO_TERM_DOCS_SENTINEL;
    child->doc      = KINO_TERM_DOCS_SENTINEL;
    child->freq     = KINO_TERM_DOCS_SENTINEL;

    /* child->positions starts life as an empty string */
    child->positions = newSV(1);
    SvCUR_set(child->positions, 0);
    SvPOK_on(child->positions);

    term_docs->set_doc_freq  = Kino_SegTermDocs_set_doc_freq;
    term_docs->get_doc_freq  = Kino_SegTermDocs_get_doc_freq;
    term_docs->get_doc       = Kino_SegTermDocs_get_doc;
    term_docs->get_freq      = Kino_SegTermDocs_get_freq;
    term_docs->get_positions = Kino_SegTermDocs_get_positions;
    term_docs->bulk_read     = Kino_SegTermDocs_bulk_read;
    term_docs->seek_tinfo    = Kino_SegTermDocs_seek_tinfo;
    term_docs->next          = Kino_SegTermDocs_next;
    term_docs->skip_to       = Kino_SegTermDocs_skip_to;
    term_docs->destroy       = Kino_SegTermDocs_destroy;

    child->freq_stream_sv   = &PL_sv_undef;
    child->prox_stream_sv   = &PL_sv_undef;
    child->skip_stream_sv   = &PL_sv_undef;
    child->deldocs_sv       = &PL_sv_undef;
    child->reader_sv        = &PL_sv_undef;

    child->count            = 0;
    child->read_positions = 0; /* off by default */
}

/* Store doc_freq in the child struct. */
void
Kino_SegTermDocs_set_doc_freq(TermDocs *term_docs, U32 doc_freq) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    child->doc_freq = doc_freq;
}

/* Return the stored doc_freq. */
U32
Kino_SegTermDocs_get_doc_freq(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->doc_freq;
}

/* Return the current doc number. */
U32
Kino_SegTermDocs_get_doc(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->doc;
}

/* Return the term frequency for the current doc. */
U32
Kino_SegTermDocs_get_freq(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->freq;
}

/* Return the positions SV itself (not a copy; no refcount change). */
SV*
Kino_SegTermDocs_get_positions(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->positions;
}

/* Read up to num_wanted non-deleted postings in one call.
 *
 * doc_nums_sv and freqs_sv are turned into string SVs whose buffers are
 * filled with packed U32 doc numbers and frequencies respectively; their
 * string lengths are set to num_got * sizeof(U32).  Positions are not read
 * here.  Returns the number of postings written.
 */
U32
Kino_SegTermDocs_bulk_read(TermDocs *term_docs, SV* doc_nums_sv,
                           SV* freqs_sv, U32 num_wanted) {
    SegTermDocsChild *child;
    InStream         *freq_stream;
    U32               doc_code;
    U32              *doc_nums;
    U32              *freqs;
    STRLEN            len;
    U32               num_got = 0;

    /* local copies */
    child       = (SegTermDocsChild*)term_docs->child;
    freq_stream = child->freq_stream;

    /* allocate space in supplied SVs and make them POK, if necessary */
    len = num_wanted * sizeof(U32);
    SvUPGRADE(doc_nums_sv, SVt_PV);
    SvUPGRADE(freqs_sv,    SVt_PV);
    SvPOK_on(doc_nums_sv);
    SvPOK_on(freqs_sv);
    doc_nums = (U32*)SvGROW(doc_nums_sv, len + 1);
    freqs    = (U32*)SvGROW(freqs_sv,    len + 1);

    while (child->count < child->doc_freq && num_got < num_wanted) {
        /* manually inlined call to term_docs->next */
        child->count++;
        doc_code = freq_stream->read_vint(freq_stream);
        /* high bits are the doc delta; low bit set means freq == 1 */
        child->doc  += doc_code >> 1;
        if (doc_code & 1)
            child->freq = 1;
        else
            child->freq = freq_stream->read_vint(freq_stream);

        /* if the doc isn't deleted... */
        if ( !Kino_BitVec_get(child->deldocs, child->doc) ) {
            /* ... append to results */
            *doc_nums++ = child->doc;
            *freqs++    = child->freq;
            num_got++;
        }
    }

    /* set the string end to the end of the U32 array */
    SvCUR_set(doc_nums_sv, (num_got * sizeof(U32)));
    SvCUR_set(freqs_sv,    (num_got * sizeof(U32)));

    return num_got;
}

/* Advance to the next non-deleted posting.  Returns 1 on success, or 0 once
 * all doc_freq postings have been consumed.
 */
bool
Kino_SegTermDocs_next(TermDocs *term_docs) {
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;
    InStream         *freq_stream = child->freq_stream;
    U32               doc_code;

    while (1) {
        /* bail if we're out of docs */
        if (child->count == child->doc_freq) {
            return 0;
        }

        /* decode delta doc */
        doc_code = freq_stream->read_vint(freq_stream);
        child->doc  += doc_code >> 1;

        /* if the stored num was odd, the freq is 1 */
        if (doc_code & 1) {
            child->freq = 1;
        }
        /* otherwise, freq was stored as a VInt. */
        else {
            child->freq = freq_stream->read_vint(freq_stream);
        }

        child->count++;

        /* read positions if desired (done before the deletion check, so
         * positions for deleted docs are consumed from prox_stream too) */
        if (child->read_positions)
            load_positions(term_docs);

        /* if the doc isn't deleted... success! */
        if (!Kino_BitVec_get(child->deldocs, child->doc))
            break;
    }
    return 1;
}

/* Fill child->positions with child->freq packed U32 positions, decoding
 * each one from prox_stream as a running sum of VInt deltas.
 */
static void
load_positions(TermDocs *term_docs) {
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;
    InStream *prox_stream = child->prox_stream;
    STRLEN len = child->freq * sizeof(U32);
    U32 *positions, *positions_end;
    U32 position = 0;

    SvGROW( child->positions, len );
    SvCUR_set(child->positions, len);
    positions = (U32*)SvPVX(child->positions);
    positions_end = (U32*)SvEND(child->positions);

    while (positions < positions_end) {
        position += prox_stream->read_vint(prox_stream);
        *positions++ = position;
    }
}

/* Reset iteration state for a new term.  A NULL tinfo leaves the object
 * with doc_freq 0, so next() returns 0 immediately.  Otherwise the counters
 * are cleared, file pointers are copied from tinfo, and the freq and prox
 * streams are seeked to the term's data.  The skip stream is not seeked
 * here; skip_to does that the first time it runs.
 */
void
Kino_SegTermDocs_seek_tinfo(TermDocs *term_docs, TermInfo *tinfo) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;

    child->count = 0;

    if (tinfo == NULL) {
        child->doc_freq = 0;
    }
    else {
        child->doc          = 0;
        child->freq         = 0;
        child->skip_doc     = 0;
        child->skip_count   = 0;
        child->have_skipped = FALSE;
        child->num_skips    = tinfo->doc_freq / child->skip_interval;
        child->doc_freq     = tinfo->doc_freq;
        child->frq_fileptr  = tinfo->frq_fileptr;
        child->prx_fileptr  = tinfo->prx_fileptr;
        child->skip_fileptr = tinfo->frq_fileptr + tinfo->skip_offset;
        child->freq_stream->seek( child->freq_stream, tinfo->frq_fileptr );
        child->prox_stream->seek( child->prox_stream, tinfo->prx_fileptr );
    }
}

/* Advance to the first non-deleted posting whose doc number is >= target.
 * Returns TRUE if one was found, FALSE if the postings ran out.
 *
 * When the term has at least skip_interval postings, skip entries are read
 * first to jump the freq (and, if positions are being read, prox) stream
 * forward; the remainder is a linear scan via term_docs->next.
 */
bool
Kino_SegTermDocs_skip_to(TermDocs *term_docs, U32 target) {
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;

    if (child->doc_freq >= child->skip_interval) {
        InStream *freq_stream   = child->freq_stream;
        InStream *prox_stream   = child->prox_stream;
        InStream *skip_stream   = child->skip_stream;
        U32 last_skip_doc       = child->skip_doc;
        double last_frq_fileptr = freq_stream->tell(freq_stream);
        double last_prx_fileptr = -1;
        I32 num_skipped         = -1 - (child->count % child->skip_interval);

        /* first skip for this term: position the skip stream */
        if (!child->have_skipped) {
            child->skip_stream->seek(child->skip_stream, child->skip_fileptr);
            child->have_skipped = TRUE;
        }

        while (target > child->skip_doc) {
            /* remember the entry we are about to move past */
            last_skip_doc    = child->skip_doc;
            last_frq_fileptr = child->frq_fileptr;
            last_prx_fileptr = child->prx_fileptr;

            if (child->skip_doc != 0 && child->skip_doc >= child->doc) {
                num_skipped += child->skip_interval;
            }

            /* no more skip entries to read */
            if (child->skip_count >= child->num_skips) {
                break;
            }

            /* each skip entry is three VInt deltas: doc, frq ptr, prx ptr */
            child->skip_doc += skip_stream->read_vint(skip_stream);
            child->frq_fileptr += skip_stream->read_vint(skip_stream);
            child->prx_fileptr += skip_stream->read_vint(skip_stream);

            child->skip_count++;
        }

        /* if there's something to skip, skip it */
        if (last_frq_fileptr > freq_stream->tell(freq_stream)) {
            freq_stream->seek(freq_stream, last_frq_fileptr);
            if (child->read_positions) {
                prox_stream->seek(prox_stream, last_prx_fileptr);
            }
            child->doc = last_skip_doc;
            child->count += num_skipped;
        }
    }

    /* done skipping, so scan */
    do {
        if (!term_docs->next(term_docs)) {
            return FALSE;
        }
    } while (target > child->doc);

    return TRUE;
}

/* Release every SV reference held by the child, free the child struct, and
 * then hand off to Kino_TermDocs_destroy.
 */
void
Kino_SegTermDocs_destroy(TermDocs *term_docs){
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;

    SvREFCNT_dec(child->positions);
    SvREFCNT_dec(child->freq_stream_sv);
    SvREFCNT_dec(child->prox_stream_sv);
    SvREFCNT_dec(child->skip_stream_sv);
    SvREFCNT_dec(child->deldocs_sv);
    SvREFCNT_dec(child->reader_sv);

    Kino_Safefree(child);

    Kino_TermDocs_destroy(term_docs);
}

__POD__

=begin devdocs

=head1 NAME

KinoSearch::Index::SegTermDocs - single-segment TermDocs

=head1 DESCRIPTION

Single-segment implementation of KinoSearch::Index::TermDocs.

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.

=end devdocs

=cut

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -