📄 similarity.pm
字号:
package KinoSearch::Search::Similarity;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Util::CClass );

BEGIN { __PACKAGE__->init_instance_vars(); }

# See _float_to_byte.
*encode_norm = *_float_to_byte;
*decode_norm = *_byte_to_float;

# Calculate the Inverse Document Frequency for one or more Terms in a given
# collection (the Searcher represents the collection).
#
# If multiple Terms are supplied, their idfs are summed.
#
# Returns 1 when the Searcher reports a max_doc of zero.
sub idf {
    my ( $self, $term_or_terms, $searcher ) = @_;
    my $max_doc = $searcher->max_doc;
    my $terms
        = ( ref $term_or_terms eq 'ARRAY' )
        ? $term_or_terms
        : [$term_or_terms];

    return 1 unless $max_doc;    # guard against log of zero error

    # accumulate IDF, looking up each term's doc_freq exactly once
    my $idf = 0;
    for my $term (@$terms) {
        my $doc_freq = $searcher->doc_freq($term);
        $idf += 1 + log( $max_doc / ( 1 + $doc_freq ) );
    }

    return $idf;
}

# Normalize a Query's weight so that it is comparable to other Queries.
#
# Returns 1 / sqrt(sum_of_squared_weights), or 0 when the sum is 0.
sub query_norm {
    my ( $self, $sum_of_squared_weights ) = @_;
    return 0 if ( $sum_of_squared_weights == 0 );  # guard against div by zero
    return ( 1 / sqrt($sum_of_squared_weights) );
}

# KLUDGE -- see comment at STORABLE_thaw.
#
# Returns nothing when Storable is cloning; otherwise returns the
# placeholder string "1" in place of real serialized state.
sub STORABLE_freeze {
    my ( $self, $cloning ) = @_;
    return if $cloning;
    return "1";
}

package KinoSearch::Search::TitleSimilarity;
use strict;
use warnings;
use KinoSearch::Util::ToolSet;
use base qw( KinoSearch::Search::Similarity );

# Build a Similarity via the parent constructor, then switch its tf function
# to the title variant (see _use_title_tf).
sub new {
    my $self = shift->SUPER::new(@_);
    $self->_use_title_tf;
    return $self;
}

# Return 1 / sqrt(num_terms), or 0 when num_terms is false.  The argument is
# read straight from $_[1] rather than being unpacked.
sub lengthnorm {
    return 0 unless $_[1];
    return 1 / sqrt( $_[1] );
}

1;

__END__

__XS__

MODULE = KinoSearch    PACKAGE = KinoSearch::Search::Similarity

=begin comment

KLUDGE!!

Rather than attempt to serialize a Similarity, we just create a new one.

=end comment
=cut

void
STORABLE_thaw(blank_obj, cloning, serialized)
    SV *blank_obj;
    SV *cloning;
    SV *serialized;
PPCODE:
{
    Similarity *sim = Kino_Sim_new();
    SV *deep_obj = SvRV(blank_obj);
    sv_setiv(deep_obj, PTR2IV(sim));
}

void
new(either_sv)
    SV *either_sv;
PREINIT:
    const char *class;
    Similarity *sim;
PPCODE:
    /* Determine the class.  For an object invocant, ask for the package
     * name of the referent; sv_reftype() on the reference itself with
     * ob == 0 only yields a generic type name, not the class. */
    class = sv_isobject(either_sv)
        ? sv_reftype(SvRV(either_sv), 1)
        : SvPV_nolen(either_sv);

    /* build object */
    sim   = Kino_Sim_new();
    ST(0) = sv_newmortal();
    sv_setref_pv(ST(0), class, (void*)sim);
    XSRETURN(1);

=for comment
Provide a normalization factor for a field based on the square-root of the
number of terms in it.  Term counts below 100 are treated as 100.

=cut

float
lengthnorm(sim, num_terms)
    Similarity *sim;
    U32         num_terms;
CODE:
    num_terms = num_terms < 100 ? 100 : num_terms;
    RETVAL = (float)1 / sqrt(num_terms);
OUTPUT: RETVAL

=for comment
Return a score factor based on the frequency of a term in a given document.
The default implementation is sqrt(freq).  Other implementations typically
produce ascending scores with ascending freqs, since the more times a doc
matches, the more relevant it is likely to be.

=cut

float
tf(sim, freq)
    Similarity *sim;
    U32         freq;
CODE:
    RETVAL = sim->tf(sim, freq);
OUTPUT: RETVAL

=for comment
_float_to_byte and _byte_to_float encode and decode between 32-bit IEEE
floating point numbers and a 5-bit exponent, 3-bit mantissa float.  The range
covered by the single-byte encoding is 7x10^9 to 2x10^-9.  The accuracy is
about one significant decimal digit.

=cut

SV*
_float_to_byte(sim, f)
    Similarity *sim;
    float       f;
PREINIT:
    char b;
CODE:
    b      = Kino_Sim_float2byte(sim, f);
    RETVAL = newSVpv(&b, 1);
OUTPUT: RETVAL

float
_byte_to_float(sim, b)
    Similarity *sim;
    char        b;
CODE:
    RETVAL = Kino_Sim_byte2float(sim, b);
OUTPUT: RETVAL

=for comment
The norm_decoder caches the 256 possible byte => float pairs, obviating the
need to call decode_norm over and over for a scoring implementation that
knows how to use it.

=cut

SV*
get_norm_decoder(sim)
    Similarity *sim;
CODE:
    RETVAL = newSVpv( (char*)sim->norm_decoder, (256 * sizeof(float)) );
OUTPUT: RETVAL

=for comment
Return a score factor from the Similarity's coord function, given the number
of matching terms (overlap) and the maximum possible (max_overlap).

=cut

float
coord(sim, overlap, max_overlap)
    Similarity *sim;
    U32         overlap;
    U32         max_overlap;
CODE:
    RETVAL = sim->coord(sim, overlap, max_overlap);
OUTPUT: RETVAL

=for comment
Replace the Similarity's tf function with Kino_Sim_title_tf.

=cut

void
_use_title_tf(sim)
    Similarity *sim;
PPCODE:
    sim->tf = Kino_Sim_title_tf;

void
DESTROY(sim)
    Similarity *sim;
PPCODE:
    Kino_Sim_destroy(sim);

__H__

#ifndef H_KINO_SIMILARITY
#define H_KINO_SIMILARITY 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchUtilMemManager.h"

typedef struct similarity {
    float  (*tf)(struct similarity*, float);
    float  (*coord)(struct similarity*, U32, U32);
    float   *norm_decoder;   /* 256 cached byte => float decodings */
} Similarity;

Similarity* Kino_Sim_new();
float Kino_Sim_default_tf(Similarity*, float);
float Kino_Sim_title_tf(Similarity*, float);
char  Kino_Sim_float2byte(Similarity*, float);
float Kino_Sim_byte2float(Similarity*, char);
float Kino_Sim_coord(Similarity*, U32, U32);
void  Kino_Sim_destroy(Similarity*);

#endif /* include guard */

__C__

#include "KinoSearchSearchSimilarity.h"

/* Allocate a Similarity, fill its norm_decoder cache, and install the
 * default tf and coord functions. */
Similarity*
Kino_Sim_new() {
    int            i;
    unsigned char  aUChar;
    Similarity    *sim;

    Kino_New(0, sim, 1, Similarity);

    /* cache decoded norms */
    Kino_New(0, sim->norm_decoder, 256, float);
    for (i = 0; i < 256; i++) {
        aUChar = i;
        sim->norm_decoder[i] = Kino_Sim_byte2float(sim, (char)aUChar);
    }

    sim->tf    = Kino_Sim_default_tf;
    sim->coord = Kino_Sim_coord;

    return sim;
}

/* Default term-frequency factor: sqrt(freq). */
float
Kino_Sim_default_tf(Similarity *sim, float freq) {
    return( sqrt(freq) );
}

/* Title term-frequency factor: always 1.0, regardless of freq. */
float
Kino_Sim_title_tf(Similarity *sim, float freq) {
    return 1.0;
}

/* Encode a float into a single byte: 5-bit exponent in the high bits,
 * 3-bit mantissa in the low bits.  Negative input is treated as 0, and 0
 * encodes to byte 0. */
char
Kino_Sim_float2byte(Similarity *sim, float f) {
    char norm;
    I32  mantissa;
    I32  exponent;
    /* Read the float's bit pattern through a union rather than a pointer
     * cast, which would violate strict-aliasing rules. */
    union { float as_float; I32 as_bits; } pun;

    if (f < 0.0)
        f = 0.0;

    if (f == 0.0) {
        norm = 0;
    }
    else {
        pun.as_float = f;
        mantissa = (pun.as_bits & 0xffffff) >> 21;
        exponent = (((pun.as_bits >> 24) & 0x7f) - 63) + 15;

        /* clamp to the largest encodable value */
        if (exponent > 31) {
            exponent = 31;
            mantissa = 7;
        }
        /* clamp to the smallest non-zero encodable value */
        if (exponent < 0) {
            exponent = 0;
            mantissa = 1;
        }

        norm = (char)((exponent << 3) | mantissa);
    }

    return norm;
}

/* Decode a byte produced by Kino_Sim_float2byte back into a float.  Byte 0
 * decodes to 0. */
float
Kino_Sim_byte2float(Similarity *sim, char b) {
    I32 mantissa;
    I32 exponent;
    /* Build the bit pattern in a union rather than casting an I32 pointer
     * to a float pointer, which would violate strict-aliasing rules. */
    union { float as_float; I32 as_bits; } pun;

    if (b == 0) {
        pun.as_bits = 0;
    }
    else {
        mantissa = b & 7;
        exponent = (b >> 3) & 31;
        pun.as_bits = ((exponent + (63 - 15)) << 24) | (mantissa << 21);
    }

    return pun.as_float;
}

/* Calculate a score factor based on the number of terms which match. */
float
Kino_Sim_coord(Similarity *sim, U32 overlap, U32 max_overlap) {
    if (max_overlap == 0)
        return 1;
    return (float)overlap / (float)max_overlap;
}

/* Free the norm_decoder cache and then the Similarity itself. */
void
Kino_Sim_destroy(Similarity *sim) {
    Kino_Safefree(sim->norm_decoder);
    Kino_Safefree(sim);
}

__POD__

=begin devdocs

=head1 NAME

KinoSearch::Search::Similarity - calculate how closely two items match

=head1 DESCRIPTION

The Similarity class encapsulates some of the math used when calculating
scores.

TitleSimilarity is tuned for best results with title fields.

=head1 SEE ALSO

The Lucene equivalent of this class provides a thorough discussion of the
Lucene scoring algorithm, which KinoSearch implements.

=head1 COPYRIGHT

Copyright 2005-2007 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch|KinoSearch> version 0.163.

=end devdocs
=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -