📄 nutchanalysis.java
字号:
/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */package org.apache.nutch.analysis;import java.io.StringReader;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.searcher.Query;import org.apache.nutch.searcher.QueryFilters;import org.apache.nutch.searcher.Query.Clause;import org.apache.nutch.util.NutchConfiguration;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.StopFilter;import org.apache.lucene.analysis.TokenStream;import java.io.*;import java.util.*;/** The JavaCC-generated Nutch lexical analyzer and query parser. */public class NutchAnalysis implements NutchAnalysisConstants { private static final String[] STOP_WORDS = { "a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS); private Analyzer analyzer = null; private String queryString; private QueryFilters queryFilters; /** Constructs a nutch analysis. */ public NutchAnalysis(String query, Analyzer analyzer) { this(new FastCharStream(new StringReader(query))); this.analyzer = analyzer; } /** True iff word is a stop word. Stop words are only removed from queries. * Every word is indexed. */ public static boolean isStopWord(String word) { return STOP_SET.contains(word); } /** Construct a query parser for the text in a reader. */ public static Query parseQuery(String queryString, Configuration conf) throws IOException { return parseQuery(queryString, null, conf); } /** Construct a query parser for the text in a reader. */ public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf) throws IOException { NutchAnalysis parser = new NutchAnalysis( queryString, (analyzer != null) ? 
analyzer : new NutchDocumentAnalyzer(conf)); parser.queryString = queryString; parser.queryFilters = new QueryFilters(conf); return parser.parse(conf); } /** For debugging. */ public static void main(String[] args) throws Exception { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); while (true) { System.out.print("Query: "); String line = in.readLine(); System.out.println(parseQuery(line, NutchConfiguration.create())); } }/** Parse a query. */ final public Query parse(Configuration conf) throws ParseException { Query query = new Query(conf); ArrayList terms; Token token; String field; boolean stop; boolean prohibited; nonOpOrTerm(); label_1: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case WORD: case ACRONYM: case SIGRAM: case PLUS: case MINUS: case QUOTE: ; break; default: jj_la1[0] = jj_gen; break label_1; } stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: case MINUS: switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: jj_consume_token(PLUS); stop=false; break; case MINUS: jj_consume_token(MINUS); stop=false;prohibited=true; break; default: jj_la1[1] = jj_gen; jj_consume_token(-1); throw new ParseException(); } break; default: jj_la1[2] = jj_gen; ; } if (jj_2_1(2147483647)) { token = jj_consume_token(WORD); jj_consume_token(COLON); field = token.image; } else { ; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case QUOTE: terms = phrase(field); stop=false; break; case WORD: case ACRONYM: case SIGRAM: // quoted terms or terms = compound(field); break; default: jj_la1[3] = jj_gen; jj_consume_token(-1); throw new ParseException(); } nonOpOrTerm(); String[] array = (String[])terms.toArray(new String[terms.size()]); if (stop && field == Clause.DEFAULT_FIELD && terms.size()==1 && isStopWord(array[0])) { // ignore stop words only when single, unadorned terms in default field } else { if (prohibited) query.addProhibitedPhrase(array, field); else query.addRequiredPhrase(array, 
field); } } {if (true) return query;} throw new Error("Missing return statement in function"); }/** Parse an explcitly quoted phrase query. Note that this may return a single * term, a trivial phrase.*/ final public ArrayList phrase(String field) throws ParseException { int start; int end; ArrayList result = new ArrayList(); String term; jj_consume_token(QUOTE); start = token.endColumn; label_2: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: case MINUS: case COLON: case SLASH: case DOT: case ATSIGN: case APOSTROPHE: case WHITE: ; break; default: jj_la1[4] = jj_gen; break label_2; } nonTerm(); } label_3: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case WORD: case ACRONYM: case SIGRAM: ; break; default: jj_la1[5] = jj_gen; break label_3; } term = term(); result.add(term); label_4: while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: case MINUS: case COLON: case SLASH: case DOT: case ATSIGN: case APOSTROPHE: case WHITE: ; break; default: jj_la1[6] = jj_gen; break label_4; } nonTerm(); } } end = token.endColumn; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case QUOTE: jj_consume_token(QUOTE); break; case 0: jj_consume_token(0); break; default: jj_la1[7] = jj_gen; jj_consume_token(-1); throw new ParseException(); } if (this.queryFilters.isRawField(field)) { result.clear(); result.add(queryString.substring(start, end)); } {if (true) return result;} throw new Error("Missing return statement in function"); }/** Parse a compound term that is interpreted as an implicit phrase query. * Compounds are a sequence of terms separated by infix characters. Note that * htis may return a single term, a trivial compound. 
*/ final public ArrayList compound(String field) throws ParseException { int start; ArrayList result = new ArrayList(); String term; StringBuffer terms = new StringBuffer(); start = token.endColumn; term = term(); terms.append(term).append(" "); //result.add(term); label_5: while (true) { if (jj_2_2(2147483647)) { ; } else { break label_5; } label_6: while (true) { infix(); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case PLUS: case MINUS: case COLON: case SLASH: case DOT: case ATSIGN: case APOSTROPHE: ; break; default: jj_la1[8] = jj_gen; break label_6; } } term = term(); terms.append(term).append(" "); //result.add(term); } if (this.queryFilters.isRawField(field)) {// result.clear(); result.add(queryString.substring(start, token.endColumn)); } else { org.apache.lucene.analysis.Token token; TokenStream tokens = analyzer.tokenStream( field, new StringReader(terms.toString())); while (true) { try { token = tokens.next(); } catch (IOException e) { token = null; } if (token == null) { break; } result.add(token.termText()); } try { tokens.close(); } catch (IOException e) { // ignore } } {if (true) return result;} throw new Error("Missing return statement in function"); }/** Parse a single term. */ final public String term() throws ParseException { Token token; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case WORD: token = jj_consume_token(WORD); break; case ACRONYM: token = jj_consume_token(ACRONYM); break; case SIGRAM: token = jj_consume_token(SIGRAM); break; default: jj_la1[9] = jj_gen; jj_consume_token(-1); throw new ParseException(); } {if (true) return token.image;} throw new Error("Missing return statement in function"); }/** Parse anything but a term or a quote. 
*/
  final public void nonTerm() throws ParseException {
    final int lookahead = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (lookahead) {
      case WHITE:
        jj_consume_token(WHITE);
        break;
      case PLUS:
      case MINUS:
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
        infix();
        break;
      default:
        jj_la1[10] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
    }
  }

  /**
   * Parse a non-term (whitespace or an infix character), or consume token
   * kind 0 instead.
   *
   * @throws ParseException if the next token is neither
   */
  final public void nonTermOrEOF() throws ParseException {
    final int lookahead = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (lookahead) {
      case PLUS:
      case MINUS:
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
      case WHITE:
        nonTerm();
        break;
      case 0:
        jj_consume_token(0);
        break;
      default:
        jj_la1[11] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
    }
  }

  /**
   * Parse anything but a term or an operator (plus or minus or quote).
   *
   * <p>Repeats for as long as the two-token lookahead {@code jj_2_3}
   * succeeds.  A plus or minus is consumed here only together with the
   * non-term (or token kind 0) that follows it.
   *
   * @throws ParseException if the input does not match the grammar
   */
  final public void nonOpOrTerm() throws ParseException {
    while (jj_2_3(2)) {
      final int lookahead = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
      switch (lookahead) {
        case WHITE:
          jj_consume_token(WHITE);
          break;
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
          nonOpInfix();
          break;
        case PLUS:
        case MINUS:
          switch ((jj_ntk == -1) ? jj_ntk() : jj_ntk) {
            case PLUS:
              jj_consume_token(PLUS);
              break;
            case MINUS:
              jj_consume_token(MINUS);
              break;
            default:
              jj_la1[12] = jj_gen;
              jj_consume_token(-1);
              throw new ParseException();
          }
          nonTermOrEOF();
          break;
        default:
          jj_la1[13] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
      }
    }
  }

  /**
   * Characters which can be used to form compound terms: plus, minus, or
   * any character accepted by {@code nonOpInfix()}.
   *
   * @throws ParseException if the next token is not an infix character
   */
  final public void infix() throws ParseException {
    final int lookahead = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (lookahead) {
      case PLUS:
        jj_consume_token(PLUS);
        break;
      case MINUS:
        jj_consume_token(MINUS);
        break;
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
        nonOpInfix();
        break;
      default:
        jj_la1[14] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
    }
  }

  /** Parse infix characters except plus and minus.
*/
  final public void nonOpInfix() throws ParseException {
    // Consume exactly one of the five non-operator infix tokens.
    final int lookahead = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (lookahead) {
      case COLON:
        jj_consume_token(COLON);
        break;
      case SLASH:
        jj_consume_token(SLASH);
        break;
      case DOT:
        jj_consume_token(DOT);
        break;
      case ATSIGN:
        jj_consume_token(ATSIGN);
        break;
      case APOSTROPHE:
        jj_consume_token(APOSTROPHE);
        break;
      default:
        jj_la1[15] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
    }
  }

  // The definition below continues beyond the visible part of the file and
  // is reproduced unchanged.
  final private boolean jj_2_1(int xla) {
    jj_la = xla; jj_lastpos = jj_scanpos = token;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -