📄 queryparser.pm
字号:
package KinoSearch::QueryParser::QueryParser;use strict;use warnings;use KinoSearch::Util::ToolSet;use base qw( KinoSearch::Util::Class );BEGIN { __PACKAGE__->init_instance_vars( # constructor args / members analyzer => undef, default_boolop => 'OR', default_field => undef, # back compat fields => undef, # members bool_groups => undef, phrases => undef, bool_group_re => undef, phrase_re => undef, label_inc => 0, );}use KinoSearch::Analysis::TokenBatch;use KinoSearch::Analysis::Tokenizer;use KinoSearch::Search::BooleanQuery;use KinoSearch::Search::PhraseQuery;use KinoSearch::Search::TermQuery;use KinoSearch::Index::Term;sub init_instance { my $self = shift; $self->{bool_groups} = {}; $self->{phrases} = {}; croak("default_boolop must be either 'AND' or 'OR'") unless $self->{default_boolop} =~ /^(?:AND|OR)$/; # create a random string that presumably won't appear in a search string my @chars = ( 'A' .. 'Z' ); my $randstring = ''; $randstring .= $chars[ rand @chars ] for ( 1 .. 16 ); $self->{randstring} = $randstring; # create labels which won't appear in search strings $self->{phrase_re} = qr/^(_phrase$randstring\d+)/; $self->{bool_group_re} = qr/^(_boolgroup$randstring\d+)/; # verify fields param my $fields = defined $self->{fields} ? $self->{fields} : [ $self->{default_field} ]; croak("Required parameter 'fields' not supplied as arrayref") unless ( defined $fields and reftype($fields) eq 'ARRAY' ); $self->{fields} = $fields; # verify analyzer croak("Missing required param 'analyzer'") unless a_isa_b( $self->{analyzer}, 'KinoSearch::Analysis::Analyzer' );}# regex matching a quoted stringmy $quoted_re = qr/ " # opening quote ( # capture [^"]*? # anything not a quote ) (?:"|$) # closed by either a quote or end of string /xsm;# regex matching a parenthetical groupmy $paren_re = qr/ \( # opening paren ( # capture [^()]*? # anything not a paren ) (?:\)|$) # closed by paren or end of string /xsm;# regex matching a negating boolean operatormy $neg_re = qr/^(?: NOT\s+ # NOT followed by space |-(?=\S) # minus followed by something not-spacey )/xsm;# regex matching a requiring boolean operatormy $req_re = qr/^ \+(?=\S) # plus followed by something not-spacey /xsm;# regex matching a field indicatormy $field_re = qr/^ ( # capture [^"(:\s]+ # non-spacey string ) : # followed by : /xsm;sub parse { my ( $self, $qstring_orig, $default_fields ) = @_; $qstring_orig = '' unless defined $qstring_orig; $default_fields ||= $self->{fields}; my $default_boolop = $self->{default_boolop}; my @clauses; # substitute contiguous labels for phrases and boolean groups my $qstring = $self->_extract_phrases($qstring_orig); $qstring = $self->_extract_boolgroups($qstring); local $_ = $qstring; while ( bytes::length $_ ) { # fast-forward past whitespace next if s/^\s+//; my $occur = $default_boolop eq 'AND' ? 'MUST' : 'SHOULD'; if (s/^AND\s+//) { if (@clauses) { # require the previous clause (unless it's negated) if ( $clauses[-1]{occur} eq 'SHOULD' ) { $clauses[-1]{occur} = 'MUST'; } } # require this clause $occur = 'MUST'; } elsif (s/^OR\s+//) { if (@clauses) { $clauses[-1]{occur} = 'SHOULD'; } $occur = 'SHOULD'; } # detect tokens which cause this clause to be required or negated if (s/$neg_re//) { $occur = 'MUST_NOT'; } elsif (s/$req_re//) { $occur = 'MUST'; } # set the field my $fields = s/^$field_re// ? [$1] : $default_fields; # if a phrase label is detected... if (s/$self->{phrase_re}//) { my $query; # retreive the text and analyze it my $orig_phrase_text = delete $self->{phrases}{$1}; my $token_texts = $self->_analyze($orig_phrase_text); if (@$token_texts) { my $query = $self->_get_field_query( $fields, $token_texts ); push @clauses, { query => $query, occur => $occur } if defined $query; } } # if a label indicating a bool group is detected... elsif (s/$self->{bool_group_re}//) { # parse boolean subqueries recursively my $inner_text = delete $self->{bool_groups}{$1}; my $query = $self->parse($inner_text, $fields); push @clauses, { query => $query, occur => $occur }; } # what's left is probably a term elsif (s/([^"(\s]+)//) { my $token_texts = $self->_analyze($1); @$token_texts = grep { $_ ne '' } @$token_texts; if (@$token_texts) { my $query = $self->_get_field_query( $fields, $token_texts ); push @clauses, { occur => $occur, query => $query }; } } } if ( @clauses == 1 and $clauses[0]{occur} ne 'MUST_NOT' ) { # if it's just a simple query, return it unwrapped return $clauses[0]{query}; } else { # otherwise, build a boolean query my $bool_query = KinoSearch::Search::BooleanQuery->new; for my $clause (@clauses) { $bool_query->add_clause( query => $clause->{query}, occur => $clause->{occur}, ); } return $bool_query; }}# Wrap a TermQuery/PhraseQuery to deal with multiple fields.sub _get_field_query { my ( $self, $fields, $token_texts ) = @_; my @queries = grep { defined $_ } map { $self->_gen_single_field_query( $_, $token_texts ) } @$fields; if ( @queries == 0 ) { return; } elsif ( @queries == 1 ) { return $queries[0]; } else { my $wrapper_query = KinoSearch::Search::BooleanQuery->new; for my $query (@queries) { $wrapper_query->add_clause( query => $query, occur => 'SHOULD', ); } return $wrapper_query; }}# Create a TermQuery, a PhraseQuery, or nothing.sub _gen_single_field_query { my ( $self, $field, $token_texts ) = @_; if ( @$token_texts == 1 ) { my $term = KinoSearch::Index::Term->new( $field, $token_texts->[0] ); return KinoSearch::Search::TermQuery->new( term => $term ); } elsif ( @$token_texts > 1 ) { my $phrase_query = KinoSearch::Search::PhraseQuery->new; for my $token_text (@$token_texts) { $phrase_query->add_term( KinoSearch::Index::Term->new( $field, $token_text ), ); } return $phrase_query; }}# break a string into tokenssub _analyze { my ( $self, $string ) = @_; my $token_batch = KinoSearch::Analysis::TokenBatch->new; $token_batch->append( $string, 0, bytes::length($string) ); $token_batch = $self->{analyzer}->analyze($token_batch); my @token_texts; while ( $token_batch->next ) { push @token_texts, $token_batch->get_text; } return \@token_texts;}# replace all phrases with labelssub _extract_phrases { my ( $self, $qstring ) = @_; while ( $qstring =~ $quoted_re ) { my $label = sprintf( "_phrase$self->{randstring}%d", $self->{label_inc}++ ); $qstring =~ s/$quoted_re/$label /; # extra space for safety # store the phrase text for later retrieval $self->{phrases}{$label} = $1; } return $qstring;}# recursively replace boolean groupings with labels, innermost firstsub _extract_boolgroups { my ( $self, $qstring ) = @_; while ( $qstring =~ $paren_re ) { my $label = sprintf( "_boolgroup$self->{randstring}%d", $self->{label_inc}++ ); $qstring =~ s/$paren_re/$label /; # extra space for safety # store the text for later retrieval $self->{bool_groups}{$label} = $1; } return $qstring;}1;__END__=head1 NAMEKinoSearch::QueryParser::QueryParser - transform a string into a Query object=head1 SYNOPSIS my $query_parser = KinoSearch::QueryParser::QueryParser->new( analyzer => $analyzer, fields => [ 'bodytext' ], ); my $query = $query_parser->parse( $query_string ); my $hits = $searcher->search( query => $query );=head1 DESCRIPTIONThe QueryParser accepts search strings as input and produces Query objects,suitable for feeding into L<KinoSearch::Searcher|KinoSearch::Searcher>.=head2 SyntaxThe following constructs are recognized by QueryParser.=over=item *Boolean operators 'AND', 'OR', and 'AND NOT'.=item *Prepented +plus and -minus, indicating that the labeled entity should beeither required or forbidden -- be it a single word, a phrase, or aparenthetical group.=item *Logical groups, delimited by parentheses.=item *Phrases, delimited by double quotes.=item *Field-specific terms, in the form of C<fieldname:termtext>. (The fieldspecified by fieldname will be used instead of the QueryParser's defaultfields).A field can also be given to a logical group, in which case it is the same asif the field had been prepended onto every term in the group. For example:C<foo:(bar baz)> is the same as C<foo:bar foo:baz>.=back=head1 METHODS=head2 new my $query_parser = KinoSearch::QueryParser::QueryParser->new( analyzer => $analyzer, # required fields => [ 'bodytext' ], # required default_boolop => 'AND', # default: 'OR' );Constructor. Takes hash-style parameters:=over=item *B<analyzer> - An object which subclassesL<KinoSearch::Analysis::Analyzer|KinoSearch::Analysis::Analyzer>. ThisB<must> be identical to the Analyzer used at index-time, or the results won'tmatch up.=item *B<fields> - the names of the fields which will be searched against. Must besupplied as an arrayref.=item *B<default_field> - deprecated. Use C<fields> instead.=item *B<default_boolop> - two possible values: 'AND' and 'OR'. The default is 'OR',which means: return documents which match any of the query terms. If youwant only documents which match all of the query terms, set this to 'AND'.=back=head2 parse my $query = $query_parser->parse( $query_string );Turn a query string into a Query object. Depending on the contents of thequery string, the returned object could be any one of several subclasses ofL<KinoSearch::Search::Query|KinoSearch::Search::Query>.=head1 COPYRIGHTCopyright 2005-2007 Marvin Humphrey=head1 LICENSE, DISCLAIMER, BUGS, etc.See L<KinoSearch|KinoSearch> version 0.163.=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -