📄 tokenbatch.pm
字号:
    /* NOTE(review): this is the tail of a destructor whose signature is above
     * this excerpt.  It walks the token linked list freeing each Token, drops
     * the refcounts on the batch's cached postings array and term-vector
     * string, then frees the batch struct itself. */
    Token *token = batch->first;
    while (token != NULL) {
        Token *next = token->next;
        Kino_Token_destroy(token);
        token = next;
    }
    SvREFCNT_dec( (SV*)batch->postings );
    SvREFCNT_dec(batch->tv_string);
    Kino_Safefree(batch);
}

/* Advance the batch's iterator by one token.
 * Returns 1 while the batch is located at a valid token, 0 once the end of
 * the list is passed.  The first call after a reset (initialized == 0)
 * positions the iterator at the head rather than advancing. */
I32
Kino_TokenBatch_next(TokenBatch *batch) {
    /* enter iterative mode */
    if (batch->initialized == 0) {
        batch->current     = batch->first;
        batch->initialized = 1;
    }
    /* continue iterative mode */
    else {
        batch->current = batch->current->next;
    }
    return batch->current == NULL ? 0 : 1;
}

/* Rewind the iterator so the next call to Kino_TokenBatch_next() starts
 * again from the first token. */
void
Kino_TokenBatch_reset(TokenBatch *batch) {
    batch->initialized = 0;
}

/* Append a token to the end of the batch's doubly-linked token list and
 * bump the batch's size.  The batch takes ownership of the token (it is
 * freed by the destructor above). */
void
Kino_TokenBatch_append(TokenBatch *batch, Token *token) {
    token->next = NULL;
    token->prev = batch->last;
    /* if this is the first token added, init */
    if (batch->first == NULL) {
        batch->first = token;
        batch->last  = token;
    }
    else {
        batch->last->next = token;
        batch->last       = token;
    }
    batch->size++;
}

/* Byte widths used when sizing/parsing the serialized posting scalars:
 * POSDATA_LEN = 3 x U32 (position, start_offset, end_offset). */
#define POSDATA_LEN 12 
#define DOC_NUM_LEN 4
#define NULL_BYTE_LEN 1
#define TEXT_LEN_LEN 2

/* Encode postings in the serialized format expected by PostingsWriter, plus
 * the term vector expected by FieldsWriter. 
 */
void
Kino_TokenBatch_build_plist(TokenBatch *batch, U32 doc_num, U16 field_num) {
    char    doc_num_buf[4];
    char    field_num_buf[2];
    char    text_len_buf[2];
    char    vint_buf[5];
    HV     *pos_hash;
    HE     *he;
    AV     *out_av;
    I32     i = 0;
    I32     overlap, num_bytes, num_positions;
    I32     num_postings = 0;
    SV    **sv_ptr;
    char   *text, *source_ptr, *dest_ptr, *end_ptr;
    char   *last_text = "";
    STRLEN  text_len, len, fake_len;
    STRLEN  last_len = 0;
    SV     *serialized_sv;
    SV     *tv_string_sv;
    U32    *source_u32, *dest_u32, *end_u32;

    /* prepare doc num and field num in anticipation of upcoming loop */
    Kino_encode_bigend_U32(doc_num, doc_num_buf);
    Kino_encode_bigend_U16(field_num, field_num_buf);

    /* Build a posting list hash: key is the term text, value is a
     * serialized scalar that accumulates posdata triplets as duplicate
     * terms are encountered. */
    pos_hash = newHV();
    while (Kino_TokenBatch_next(batch)) {
        Token* token = batch->current;

        /* either start a new hash entry or retrieve an existing one */
        if (!hv_exists(pos_hash, token->text, token->len)) {
            /* the values are the serialized scalars */
            if (token->len > 65535)
                Kino_confess("Maximum token length is 65535; got %d",
                             token->len);
            Kino_encode_bigend_U16(token->len, text_len_buf);

            /* allocate the serialized scalar */
            len = TEXT_LEN_LEN        /* for now, put text_len at top */
                + KINO_FIELD_NUM_LEN  /* encoded field number */
                + token->len          /* term text */
                + NULL_BYTE_LEN       /* the term text's null byte */
                + DOC_NUM_LEN
                + POSDATA_LEN
                + TEXT_LEN_LEN        /* eventually, text_len goes at end */
                + NULL_BYTE_LEN;      /* the scalar's null byte */
            serialized_sv = newSV(len);
            SvPOK_on(serialized_sv);
            source_ptr = SvPVX(serialized_sv);
            dest_ptr   = source_ptr;

            /* Concatenate a bunch of stuff onto the serialized scalar:
             * [text_len][field_num][text][NUL][doc_num] -- posdata is
             * appended below, outside this branch. */
            Copy(text_len_buf, dest_ptr, TEXT_LEN_LEN, char);
            dest_ptr += TEXT_LEN_LEN;
            Copy(field_num_buf, dest_ptr, KINO_FIELD_NUM_LEN, char);
            dest_ptr += KINO_FIELD_NUM_LEN;
            Copy(token->text, dest_ptr, token->len, char);
            dest_ptr += token->len;
            *dest_ptr = '\0';
            dest_ptr += NULL_BYTE_LEN;
            Copy(doc_num_buf, dest_ptr, DOC_NUM_LEN, char);
            dest_ptr += DOC_NUM_LEN;
            SvCUR_set(serialized_sv, (dest_ptr - source_ptr));

            /* store the text => serialized_sv pair in the pos_hash */
            (void)hv_store(pos_hash, token->text, token->len,
                           serialized_sv, 0);
        }
        else {
            /* retrieve the serialized scalar and allocate more space */
            sv_ptr = hv_fetch(pos_hash, token->text, token->len, 0);
            if (sv_ptr == NULL)
                Kino_confess("unexpected null sv_ptr");
            serialized_sv = *sv_ptr;
            len = SvCUR(serialized_sv)
                + POSDATA_LEN    /* allocate space for upcoming posdata */
                + TEXT_LEN_LEN   /* extra space for encoded text length */
                + NULL_BYTE_LEN;
            SvGROW( serialized_sv, len );
        }

        /* Append position, start offset, end offset to the serialized_sv.
         * The position counter i advances by each token's pos_inc, so
         * stored positions are cumulative. */
        dest_u32 = (U32*)SvEND(serialized_sv);
        *dest_u32++ = (U32)i;
        i += token->pos_inc;
        *dest_u32++ = token->start_offset;
        *dest_u32++ = token->end_offset;
        len = SvCUR(serialized_sv) + POSDATA_LEN;
        SvCUR_set(serialized_sv, len);

        /* destroy the token, because nobody else will -- XXX MAYBE? */
        /* Kino_Token_destroy(token); */
    }

    /* allocate and presize the array to hold the output */
    num_postings = hv_iterinit(pos_hash);
    out_av = newAV();
    av_extend(out_av, num_postings);

    /* collect serialized scalars into an array */
    i = 0;
    while ((he = hv_iternext(pos_hash))) {
        serialized_sv = HeVAL(he);
        /* Transfer text_len to end of serialized scalar: copy the leading
         * 2 bytes to the tail, then sv_chop the front off so the scalar
         * now starts at field_num. */
        source_ptr = SvPVX(serialized_sv);
        dest_ptr   = SvEND(serialized_sv);
        Copy(source_ptr, dest_ptr, TEXT_LEN_LEN, char);
        SvCUR(serialized_sv) += TEXT_LEN_LEN;
        source_ptr += TEXT_LEN_LEN;
        sv_chop(serialized_sv, source_ptr);
        /* keep the SV alive past the hash's destruction below */
        SvREFCNT_inc(serialized_sv);
        av_store(out_av, i, serialized_sv);
        i++;
    }

    /* we're done with the positions hash, so kill it off */
    SvREFCNT_dec(pos_hash);

    /* start the term vector string */
    tv_string_sv = newSV(20);
    SvPOK_on(tv_string_sv);
    num_bytes = Kino_OutStream_encode_vint(num_postings, vint_buf);
    sv_catpvn(tv_string_sv, vint_buf, num_bytes);

    /* sort the posting lists lexically */
    sortsv(AvARRAY(out_av), num_postings, Perl_sv_cmp);

    /* Iterate through the array, making changes to the serialized scalars:
     * compact each scalar down to positions only, while emitting
     * delta-encoded term text plus position/offset vints into tv_string. */
    for (i = 0; i < num_postings; i++) {
        serialized_sv = *(av_fetch(out_av, i, 0));

        /* find the beginning of the term text */
        text = SvPV(serialized_sv, fake_len);
        text += KINO_FIELD_NUM_LEN;

        /* save the text_len; we'll move it forward later */
        end_ptr  = SvEND(serialized_sv) - TEXT_LEN_LEN;
        text_len = Kino_decode_bigend_U16( end_ptr );
        Kino_encode_bigend_U16(text_len, text_len_buf);
        source_ptr = SvPVX(serialized_sv) + KINO_FIELD_NUM_LEN + text_len
                   + NULL_BYTE_LEN + DOC_NUM_LEN;
        source_u32 = (U32*)source_ptr;
        dest_u32   = source_u32;
        end_u32    = (U32*)end_ptr;

        /* Append the string diff to the tv_string: shared-prefix length
         * with the previous (sorted) term, then the unique suffix. */
        overlap = Kino_StrHelp_string_diff(last_text, text,
                                           last_len, text_len);
        num_bytes = Kino_OutStream_encode_vint(overlap, vint_buf);
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );
        num_bytes = Kino_OutStream_encode_vint( (text_len - overlap),
                                                vint_buf );
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );
        sv_catpvn( tv_string_sv, (text + overlap), (text_len - overlap) );

        /* append the number of positions for this term */
        num_positions = SvCUR(serialized_sv)
                      - KINO_FIELD_NUM_LEN
                      - text_len
                      - NULL_BYTE_LEN
                      - DOC_NUM_LEN
                      - TEXT_LEN_LEN;
        num_positions /= POSDATA_LEN;
        num_bytes = Kino_OutStream_encode_vint(num_positions, vint_buf);
        sv_catpvn( tv_string_sv, vint_buf, num_bytes );

        /* Walk the posdata triplets: every U32 goes into tv_string as a
         * vint, but only the position (first of each triplet) is kept
         * in the serialized scalar, compacted in place via dest_u32. */
        while (source_u32 < end_u32) {
            /* keep only the positions in the serialized scalars */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            *dest_u32++ = *source_u32++;
            /* add start_offset to tv_string */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            source_u32++;
            /* add end_offset to tv_string */
            num_bytes = Kino_OutStream_encode_vint(*source_u32, vint_buf);
            sv_catpvn( tv_string_sv, vint_buf, num_bytes );
            source_u32++;
        }

        /* restore the text_len and close the scalar */
        dest_ptr = (char*)dest_u32;
        Copy(text_len_buf, dest_ptr, TEXT_LEN_LEN, char);
        dest_ptr += TEXT_LEN_LEN;
        len = dest_ptr - SvPVX(serialized_sv);
        SvCUR_set(serialized_sv, len);

        last_text = text;
        last_len  = text_len;
    }

    /* store the postings array and the term vector string */
    SvREFCNT_dec(batch->tv_string);
    batch->tv_string = tv_string_sv;
    SvREFCNT_dec(batch->postings);
    batch->postings = out_av;
}

__POD__

=head1 NAME

KinoSearch::Analysis::TokenBatch - a collection of tokens

=head1 SYNOPSIS

    while ( $batch->next ) {
        $batch->set_text( lc( $batch->get_text ) );
    }

=head1 EXPERIMENTAL API

TokenBatch's API should be considered experimental and is likely to change.

=head1 DESCRIPTION

A TokenBatch is a collection of L<Tokens|KinoSearch::Analysis::Token> which
you can add to, then iterate over.

=head1 METHODS

=head2 new

    my $batch = KinoSearch::Analysis::TokenBatch->new;

Constructor.

=head2 append

    $batch->append( $text, $start_offset, $end_offset, $pos_inc );

Add a Token to the end of the batch.  Accepts either three or four arguments:
text, start_offset, end_offset, and an optional position increment which
defaults to 1 if not supplied.  For a description of what these arguments
mean, see the docs for L<Token|KinoSearch::Analysis::Token>.

=head2 next

    while ( $batch->next ) {
        # ...
    }

Proceed to the next token in the TokenBatch.  Returns true if the TokenBatch
ends up located at valid token.

=head1 ACCESSOR METHODS

All of TokenBatch's accessor methods affect the current Token.  Calling any of
these methods when the TokenBatch is not located at a valid Token will trigger
an exception.

=head2 set_text get_text

Set/get the text of the current Token.

=head2 set_start_offset get_start_offset

Set/get the start_offset of the current Token.

=head2 set_end_offset get_end_offset

Set/get the end_offset of the current Token.

=head2 set_pos_inc get_pos_inc

Set/get the position increment of the current Token.

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.

=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -