📄 nutchanalysis.jj

📁 nutch0.8源码
💻 JJ
字号:
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/** JavaCC code for the Nutch lexical analyzer. */

options {
  STATIC = false;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
  UNICODE_INPUT = true;
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)

package org.apache.nutch.analysis;

import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.NutchConfiguration;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  /** Words that {@link #isStopWord(String)} reports as stop words. */
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  /** Lookup set built from {@link #STOP_WORDS}. */
  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  /** Analyzer applied to compound terms of non-raw fields. */
  private Analyzer analyzer = null;

  /** The original query text; raw-field terms are cut out of it verbatim. */
  private String queryString;

  /** Consulted to decide whether a field is a raw field. */
  private QueryFilters queryFilters;

  /**
   * Constructs a nutch analysis over the given query text.
   *
   * @param query    the text to tokenize and parse
   * @param analyzer the analyzer used for compound terms
   */
  public NutchAnalysis(String query, Analyzer analyzer) {
    this(new FastCharStream(new StringReader(query)));
    this.analyzer = analyzer;
  }

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /**
   * Parses a query string into a {@link Query}, analyzing terms with a
   * {@link NutchDocumentAnalyzer} created from the configuration.
   *
   * @param queryString the query text
   * @param conf        the configuration handed to the analyzer, the query
   *                    filters and the resulting query
   * @return the parsed query
   */
  public static Query parseQuery(String queryString, Configuration conf)
    throws IOException {
    return parseQuery(queryString, null, conf);
  }

  /**
   * Parses a query string into a {@link Query}.
   *
   * @param queryString the query text
   * @param analyzer    the analyzer for compound terms; when null a
   *                    {@link NutchDocumentAnalyzer} is created from conf
   * @param conf        the configuration handed to the analyzer, the query
   *                    filters and the resulting query
   * @return the parsed query
   */
  public static Query parseQuery(String queryString, Analyzer analyzer,
                                 Configuration conf)
    throws IOException {
    Analyzer effective =
      (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf);
    NutchAnalysis parser = new NutchAnalysis(queryString, effective);
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }

  /**
   * For debugging.  Reads queries from standard input, one per line, and
   * prints the parsed form of each until the input is exhausted.
   */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    Configuration conf = NutchConfiguration.create();
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {                         // end of input: stop rather
        break;                                    // than parse a null query
      }
      System.out.println(parseQuery(line, conf));
    }
  }

}

PARSER_END(NutchAnalysis)

TOKEN_MGR_DECLS : {

  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }

}

TOKEN : {                                         // token regular expressions

  // basic word -- lowercase it
<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
    {                                             // remove dots
      for (int i = 0; i < image.length(); i++) {
        if (image.charAt(i) == '.') {
          image.deleteCharAt(i--);                // re-examine the same index
        }
      }
      matchedToken.image = image.toString().toLowerCase();
    }

  // chinese, japanese and korean characters
| <SIGRAM: <CJK> >

  // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >

| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace

  // primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:                                      // alphabets
    [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff"
    ]
    >

|  <#CJK:                                         // non-alphabets
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ]
    >

| < #DIGIT:                                       // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >

}

/**
 * Parse a query.  Each clause is an optional plus or minus operator, an
 * optional "field:" prefix, and either a quoted phrase or a compound term.
 * Clauses preceded by minus are added as prohibited phrases, all others as
 * required phrases.  A clause is dropped when it is a single stop word with
 * no operator, no quotes and no explicit field.
 */
Query parse(Configuration conf) :
{
  Query query = new Query(conf);
  ArrayList terms;
  Token token;
  String field;
  boolean stop;                                   // clause may be dropped as a stop word
  boolean prohibited;                             // clause was preceded by minus
}
{
  nonOpOrTerm()                                   // skip noise
  (
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // Reference comparison: field still refers to Clause.DEFAULT_FIELD only
      // when no explicit field spec replaced the value assigned above.
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a
 * single term, a trivial phrase.  For a raw field the result is instead the
 * single substring of the query text between the column positions recorded
 * after the opening quote and before the closing quote (or end of input). */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }

  (<QUOTE>|<EOF>)

  {
    if (this.queryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound.  For a raw field the
 * result is the matching substring of the query text; otherwise the terms are
 * joined with spaces and re-tokenized by the analyzer. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
  StringBuffer terms = new StringBuffer();
}
{
  { start = token.endColumn; }

  term = term() {
    terms.append(term).append(" ");
  }
  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() {
      terms.append(term).append(" ");
    })*

  {
    if (this.queryFilters.isRawField(field)) {
      result.add(queryString.substring(start, token.endColumn));
    } else {
      TokenStream tokens = analyzer.tokenStream(
                              field, new StringReader(terms.toString()));
      try {
        org.apache.lucene.analysis.Token analyzed;
        while ((analyzed = tokens.next()) != null) {
          result.add(analyzed.termText());
        }
      } catch (IOException e) {
        // best effort: keep whatever terms were produced before the failure
      } finally {
        try {
          tokens.close();
        } catch (IOException e) {
          // ignore
        }
      }
    }
    return result;
  }

}

/** Parse a single term. */
String term() :
{
  Token token;
}
{
  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}

/** Parse anything but a term or a quote. */
void nonTerm() :
{}
{
  <WHITE> | infix()
}

/** Parse anything but a term or a quote, or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote). */
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms. */
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus. */
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -