📄 tokenbatch.pm
字号:
    /* NOTE(review): this is the tail of a destructor whose signature is above
     * this excerpt.  It walks the token linked list freeing each Token, drops
     * the refcounts on the batch's cached postings array and term-vector
     * string, then frees the batch struct itself. */
    Token *token = batch->first;
    while (token != NULL) {
        Token *next = token->next;
        Kino_Token_destroy(token);
        token = next;
    }
    SvREFCNT_dec( (SV*)batch->postings );
    SvREFCNT_dec(batch->tv_string);
    Kino_Safefree(batch);
}

/* Advance the batch's iterator by one token.
 * Returns 1 while the batch is located at a valid token, 0 once the end of
 * the list is passed.  The first call after a reset (initialized == 0)
 * positions the iterator at the head rather than advancing. */
I32
Kino_TokenBatch_next(TokenBatch *batch) {
    /* enter iterative mode */
    if (batch->initialized == 0) {
        batch->current     = batch->first;
        batch->initialized = 1;
    }
    /* continue iterative mode */
    else {
        batch->current = batch->current->next;
    }
    return batch->current == NULL ? 0 : 1;
}

/* Rewind the iterator so the next call to Kino_TokenBatch_next() starts
 * again from the first token. */
void
Kino_TokenBatch_reset(TokenBatch *batch) {
    batch->initialized = 0;
}

/* Append a token to the end of the batch's doubly-linked token list and
 * bump the batch's size.  The batch takes ownership of the token (it is
 * freed by the destructor above). */
void
Kino_TokenBatch_append(TokenBatch *batch, Token *token) {
    token->next = NULL;
    token->prev = batch->last;
    /* if this is the first token added, init */
    if (batch->first == NULL) {
        batch->first = token;
        batch->last  = token;
    }
    else {
        batch->last->next = token;
        batch->last       = token;
    }
    batch->size++;
}

/* Byte widths used when sizing/parsing the serialized posting scalars:
 * POSDATA_LEN = 3 x U32 (position, start_offset, end_offset). */
#define POSDATA_LEN 12 
#define DOC_NUM_LEN 4
#define NULL_BYTE_LEN 1
#define TEXT_LEN_LEN 2

/* Encode postings in the serialized format expected by PostingsWriter, plus
 * the term vector expected by FieldsWriter. 
 */
void
Kino_TokenBatch_build_plist(TokenBatch *batch, U32 doc_num, U16 field_num) {
    char    doc_num_buf[4];
    char    field_num_buf[2];
    char    text_len_buf[2];
    char    vint_buf[5];
    HV     *pos_hash;
    HE     *he;
    AV     *out_av;
    I32     i = 0;
    I32     overlap, num_bytes, num_positions;
    I32     num_postings = 0;
    SV    **sv_ptr;
    char   *text, *source_ptr, *dest_ptr, *end_ptr;
    char   *last_text = "";
    STRLEN  text_len, len, fake_len;
    STRLEN  last_len = 0;
    SV     *serialized_sv;
    SV     *tv_string_sv;
    U32    *source_u32, *dest_u32, *end_u32;

    /* prepare doc num and field num in anticipation of upcoming loop */
    Kino_encode_bigend_U32(doc_num, doc_num_buf);
    Kino_encode_bigend_U16(field_num, field_num_buf);

    /* Build a posting list hash: key is the term text, value is a
     * serialized scalar that accumulates posdata triplets as duplicate
     * terms are encountered. */
    pos_hash = newHV();
    while (Kino_TokenBatch_next(batch)) {
        Token* token = batch->current;

        /* either start a new hash entry or retrieve an existing one */
        if (!hv_exists(pos_hash, token->text, token->len)) {
            /* the values are the serialized scalars */
            if (token->len > 65535)
                Kino_confess("Maximum token length is 65535; got %d",
                             token->len);
            Kino_encode_bigend_U16(token->len, text_len_buf);

            /* allocate the serialized scalar */
            len = TEXT_LEN_LEN        /* for now, put text_len at top */
                + KINO_FIELD_NUM_LEN  /* encoded field number */
                + token->len          /* term text */
                + NULL_BYTE_LEN       /* the term text's null byte */
                + DOC_NUM_LEN
                + POSDATA_LEN
                + TEXT_LEN_LEN        /* eventually, text_len goes at end */
                + NULL_BYTE_LEN;      /* the scalar's null byte */
            serialized_sv = newSV(len);
            SvPOK_on(serialized_sv);
            source_ptr = SvPVX(serialized_sv);
            dest_ptr   = source_ptr;

            /* Concatenate a bunch of stuff onto the serialized scalar:
             * [text_len][field_num][text][NUL][doc_num] -- posdata is
             * appended below, outside this branch. */
            Copy(text_len_buf, dest_ptr, TEXT_LEN_LEN, char);
            dest_ptr += TEXT_LEN_LEN;
            Copy(field_num_buf, dest_ptr, KINO_FIELD_NUM_LEN, char);
            dest_ptr += KINO_FIELD_NUM_LEN;
            Copy(token->text, dest_ptr, token->len, char);
            dest_ptr += token->len;
            *dest_ptr = '\0';
            dest_ptr += NULL_BYTE_LEN;
            Copy(doc_num_buf, dest_ptr, DOC_NUM_LEN, char);
            dest_ptr += DOC_NUM_LEN;
            SvCUR_set(serialized_sv, (dest_ptr - source_ptr));

            /* store the text => serialized_sv pair in the pos_hash */
            (void)hv_store(pos_hash, token->text, token->len,
                           serialized_sv, 0);
        }
        else {
            /* retrieve the serialized scalar and allocate more space */
            sv_ptr = hv_fetch(pos_hash, token->text, token->len, 0);
            if (sv_ptr == NULL)
                Kino_confess("unexpected null sv_ptr");
            serialized_sv = *sv_ptr;
            len = SvCUR(serialized_sv)
                + POSDATA_LEN    /* allocate space for upcoming posdata */
                + TEXT_LEN_LEN   /* extra space for encoded text length */
                + NULL_BYTE_LEN;
            SvGROW( serialized_sv, len );
        }

        /* Append position, start offset, end offset to the serialized_sv.
         * The position counter i advances by each token's pos_inc, so
         * stored positions are cumulative. */
        dest_u32 = (U32*)SvEND(serialized_sv);
        *dest_u32++ = (U32)i;
        i += token->pos_inc;
        *dest_u32++ = token->start_offset;
        *dest_u32++ = token->end_offset;
        len = SvCUR(serialized_sv) + POSDATA_LEN;
        SvCUR_set(serialized_sv, len);

        /* destroy the token, because nobody else will -- XXX MAYBE? */
        /* Kino_Token_destroy(token); */
    }

    /* allocate and presize the array to hold the output */
    num_postings = hv_iterinit(pos_hash);
    out_av = newAV();
    av_extend(out_av, num_postings);

    /* collect serialized scalars into an array */
    i = 0;
    while ((he = hv_iternext(pos_hash))) {
        serialized_sv = HeVAL(he);
        /* Transfer text_len to end of serialized scalar: copy the leading
         * 2 bytes to the tail, then sv_chop the front off so the scalar
         * now starts at field_num. */
        source_ptr = SvPVX(serialized_sv);
        dest_ptr   = SvEND(serialized_sv);
        Copy(source_ptr, dest_ptr, TEXT_LEN_LEN, char);
        SvCUR(serialized_sv) += TEXT_LEN_LEN;
        source_ptr += TEXT_LEN_LEN;
        sv_chop(serialized_sv, source_ptr);
        /* keep the SV alive past the hash's destruction below */
        SvREFCNT_inc(serialized_sv);
        av_store(out_av, i, serialized_sv);
        i++;
    }

    /* we're done with the positions hash, so kill it off */
    SvREFCNT_dec(pos_hash);

    /* start the term vector string */
    tv_string_sv = newSV(20);
    SvPOK_on(tv_string_sv);
    num_bytes = Kino_OutStream_encode_vint(num_postings, vint_buf);
    sv_catpvn(tv_string_sv, vint_buf, num_bytes);

    /* sort the posting lists lexically */
    sortsv(AvARRAY(out_av), num_postings, Perl_sv_cmp);

    /* Iterate through the array, making changes to the serialized scalars:
     * compact each scalar down to positions only, while emitting
     * delta-encoded term text plus position/offset vints into tv_string. */
    for (i = 0; i < num_postings; i++) {
        serialized_sv = *(av_fetch(out_av, i, 0));

        /* find the beginning of the term text */
        text = SvPV(serialized_sv, fake_len);
        text += KINO_FIELD_NUM_LEN;

        /* save the text_len; we'll move it forward later */
        end_ptr  = SvEND(serialized_sv) - TEXT_LEN_LEN;
        text_len = Kino_decode_bigend_U16( end_ptr );
        Kino_encode_bigend_U16(text_len, text_len_buf);
        source_ptr = SvPVX(serialized_sv) + KINO_FIELD_NUM_LEN + text_len
                   + NULL_BYTE_LEN + DOC_NUM_LEN;
        source_u32 = (U32*)source_ptr;
        dest_u32   = source_u32;
        end_u32    = (U32*)end_ptr;

        /* Append the string diff to the tv_string: shared-prefix length
         * with the previous (sorted) term, then the unique suffix. */
        overlap = Kino_StrHelp_string_diff(last_text, text,
                                           last_len, text_len);
        num_bytes = Kino_OutStream_encode_vint(overlap, vint_buf);
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );
        num_bytes = Kino_OutStream_encode_vint( (text_len - overlap),
                                                vint_buf );
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );
        sv_catpvn( tv_string_sv, (text + overlap), (text_len - overlap) );

        /* append the number of positions for this term */
        num_positions = SvCUR(serialized_sv)
                      - KINO_FIELD_NUM_LEN
                      - text_len
                      - NULL_BYTE_LEN
                      - DOC_NUM_LEN
                      - TEXT_LEN_LEN;
        num_positions /= POSDATA_LEN;
        num_bytes = Kino_OutStream_encode_vint(num_positions, vint_buf);
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );

        /* Walk the posdata triplets: every U32 goes into tv_string as a
         * vint, but only the position (first of each triplet) is kept
         * in the serialized scalar, compacted in place via dest_u32. */
        while (source_u32 < end_u32) {
            /* keep only the positions in the serialized scalars */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            *dest_u32++ = *source_u32++;
            /* add start_offset to tv_string */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            source_u32++;
            /* add end_offset to tv_string */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            source_u32++;
        }

        /* restore the text_len and close the scalar */
        dest_ptr = (char*)dest_u32;
        Copy(text_len_buf, dest_ptr, TEXT_LEN_LEN, char);
        dest_ptr += TEXT_LEN_LEN;
        len = dest_ptr - SvPVX(serialized_sv);
        SvCUR_set(serialized_sv, len);

        last_text = text;
        last_len  = text_len;
    }

    /* store the postings array and the term vector string */
    SvREFCNT_dec(batch->tv_string);
    batch->tv_string = tv_string_sv;
    SvREFCNT_dec(batch->postings);
    batch->postings = out_av;
}

__POD__

=head1 NAME

KinoSearch::Analysis::TokenBatch - a collection of tokens

=head1 SYNOPSIS

    while ( $batch->next ) {
        $batch->set_text( lc( $batch->get_text ) );
    }

=head1 EXPERIMENTAL API

TokenBatch's API should be considered experimental and is likely to change.

=head1 DESCRIPTION

A TokenBatch is a collection of L<Tokens|KinoSearch::Analysis::Token> which
you can add to, then iterate over.

=head1 METHODS

=head2 new

    my $batch = KinoSearch::Analysis::TokenBatch->new;

Constructor.

=head2 append

    $batch->append( $text, $start_offset, $end_offset, $pos_inc );

Add a Token to the end of the batch.  Accepts either three or four arguments:
text, start_offset, end_offset, and an optional position increment which
defaults to 1 if not supplied.  For a description of what these arguments
mean, see the docs for L<Token|KinoSearch::Analysis::Token>.

=head2 next

    while ( $batch->next ) {
        # ...
    }

Proceed to the next token in the TokenBatch.  Returns true if the TokenBatch
ends up located at valid token.

=head1 ACCESSOR METHODS

All of TokenBatch's accessor methods affect the current Token.  Calling any of
these methods when the TokenBatch is not located at a valid Token will trigger
an exception.

=head2 set_text get_text

Set/get the text of the current Token.

=head2 set_start_offset get_start_offset

Set/get the start_offset of the current Token.

=head2 set_end_offset get_end_offset

Set/get the end_offset of the current Token.

=head2 set_pos_inc get_pos_inc

Set/get the position increment of the current Token.

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.

=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -