📄 nutchanalysis.jj
字号:
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

options {
  STATIC = false;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
  UNICODE_INPUT = true;
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  /** Words that {@link #isStopWord(String)} reports as stop words. */
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into",
    "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the",
    "their", "then", "there", "these", "they", "this", "to", "was", "will",
    "with"
  };

  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  /** Analyzer applied by compound() to terms of non-raw fields. */
  private Analyzer analyzer = null;

  /** The original query text; raw-field values are cut out of it verbatim. */
  private String queryString;

  /** Consulted by phrase() and compound() to decide whether a field is raw. */
  private QueryFilters queryFilters;

  /**
   * Constructs a nutch analysis over the given query text.
   *
   * @param query    the query text to tokenize and parse
   * @param analyzer the analyzer used for the terms of non-raw fields
   */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /**
   * Parses a query string into a {@link Query}, analyzing terms with a
   * NutchDocumentAnalyzer built from <code>conf</code>.
   *
   * @param queryString the query text
   * @param conf        the configuration handed to the query, the query
   *                    filters and the default analyzer
   * @return the parsed query
   */
  public static Query parseQuery(String queryString, Configuration conf)
    throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /**
   * Parses a query string into a {@link Query}.
   *
   * @param queryString the query text
   * @param analyzer    the analyzer for non-raw-field terms; when
   *                    <code>null</code>, a NutchDocumentAnalyzer built from
   *                    <code>conf</code> is used instead
   * @param conf        the configuration handed to the query, the query
   *                    filters and the default analyzer
   * @return the parsed query
   */
  public static Query parseQuery(String queryString, Analyzer analyzer,
                                 Configuration conf)
    throws IOException {
    NutchAnalysis parser = new NutchAnalysis(
          queryString,
          (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /**
   * For debugging.  Reads queries from standard input, one per line, and
   * prints each parsed query.  Stops at end of input.
   */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {
        // readLine() returns null at end of input; parsing it would fail
        // with a NullPointerException inside StringReader.
        break;
      }
      System.out.println(parseQuery(line, NutchConfiguration.create()));
    }
  }

}

PARSER_END(NutchAnalysis)

TOKEN_MGR_DECLS : {

  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }

}

TOKEN : {                                         // token regular expressions

  // basic word -- lowercase it
<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
    {                                             // remove dots
      for (int i = 0; i < image.length(); i++) {
        if (image.charAt(i) == '.')
          image.deleteCharAt(i--);
      }
      matchedToken.image = image.toString().toLowerCase();
    }

  // chinese, japanese and korean characters
| <SIGRAM: <CJK> >

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace

  // primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:                                      // alphabets
    [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff"
    ]
  >

| <#CJK:                                          // non-alphabets
    [
        "\u3040"-"\u318f",
        "\u3300"-"\u337f",
        "\u3400"-"\u3d2d",
        "\u4e00"-"\u9fff",
        "\uf900"-"\ufaff"
    ]
  >

| < #DIGIT:                                       // unicode digits
    [
        "\u0030"-"\u0039",
        "\u0660"-"\u0669",
        "\u06f0"-"\u06f9",
        "\u0966"-"\u096f",
        "\u09e6"-"\u09ef",
        "\u0a66"-"\u0a6f",
        "\u0ae6"-"\u0aef",
        "\u0b66"-"\u0b6f",
        "\u0be7"-"\u0bef",
        "\u0c66"-"\u0c6f",
        "\u0ce6"-"\u0cef",
        "\u0d66"-"\u0d6f",
        "\u0e50"-"\u0e59",
        "\u0ed0"-"\u0ed9",
        "\u1040"-"\u1049"
    ]
  >

}

/** Parse a query.  Each clause is an optional plus or minus operator, an
 * optional field spec, and then a quoted phrase or a compound term.  A clause
 * is added as a prohibited phrase when prefixed with minus and as a required
 * phrase otherwise.  A single stop word with no operator, no quotes and no
 * field spec is dropped. */
Query parse(Configuration conf) :
{
  Query query = new Query(conf);
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;
}
{
  nonOpOrTerm()                                   // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // Note the reference comparison on field: it still refers to
      // Clause.DEFAULT_FIELD only when no field spec was parsed above.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a single
 * term, a trivial phrase.  For a raw field the result is instead the single
 * substring of the query string between the recorded start and end columns. */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  // NOTE(review): token columns are used as indexes into queryString --
  // confirm against FastCharStream's column numbering.
  { start = token.endColumn; }

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }

  (<QUOTE>|<EOF>)                                 // closing quote is optional

  {
    if (this.queryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound.  For a raw field the
 * result is the matching substring of the query string; otherwise the terms
 * are joined with spaces and re-tokenized by the analyzer. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
  StringBuffer terms = new StringBuffer();
}
{
  // NOTE(review): token columns are used as indexes into queryString --
  // confirm against FastCharStream's column numbering.
  { start = token.endColumn; }

  term = term() {
    terms.append(term).append(" ");
  }
  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() {
      terms.append(term).append(" ");
    })*

  {
    if (this.queryFilters.isRawField(field)) {
      result.add(queryString.substring(start, token.endColumn));
    } else {
      // This local shadows the parser's current token for the rest of the block.
      org.apache.lucene.analysis.Token token;
      TokenStream tokens = analyzer.tokenStream(
                              field, new StringReader(terms.toString()));

      while (true) {
        try {
          token = tokens.next();
        } catch (IOException e) {
          // best effort: an I/O error is treated like the end of the stream
          token = null;
        }
        if (token == null) { break; }
        result.add(token.termText());
      }
      try {
        tokens.close();
      } catch (IOException e) {
        // ignore
      }
    }
    return result;
  }

}

/** Parse a single term. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}

/** Parse anything but a term or a quote. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

/** Parse anything but a term or a quote, or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote). */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms. */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -