📄 nutchanalysis.java

📁 nutch0.8源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */package org.apache.nutch.analysis;import java.io.StringReader;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.searcher.Query;import org.apache.nutch.searcher.QueryFilters;import org.apache.nutch.searcher.Query.Clause;import org.apache.nutch.util.NutchConfiguration;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.StopFilter;import org.apache.lucene.analysis.TokenStream;import java.io.*;import java.util.*;/** The JavaCC-generated Nutch lexical analyzer and query parser. */public class NutchAnalysis implements NutchAnalysisConstants {  private static final String[] STOP_WORDS = {    "a", "and", "are", "as", "at", "be", "but", "by",    "for", "if", "in", "into", "is", "it",    "no", "not", "of", "on", "or", "s", "such",    "t", "that", "the", "their", "then", "there", "these",    "they", "this", "to", "was", "will", "with"  };  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);  private Analyzer analyzer = null;  private String queryString;  private QueryFilters queryFilters;  /** Constructs a nutch analysis. */  public NutchAnalysis(String query, Analyzer analyzer) {    this(new FastCharStream(new StringReader(query)));    this.analyzer = analyzer;  }  /** True iff word is a stop word.  Stop words are only removed from queries.   * Every word is indexed.  */  public static boolean isStopWord(String word) {    return STOP_SET.contains(word);  }  /** Construct a query parser for the text in a reader. */  public static Query parseQuery(String queryString, Configuration conf) throws IOException {    return parseQuery(queryString, null, conf);  }  /** Construct a query parser for the text in a reader. */  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)    throws IOException {    NutchAnalysis parser = new NutchAnalysis(          queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));    parser.queryString = queryString;    parser.queryFilters = new QueryFilters(conf);    return parser.parse(conf);  }  /** For debugging. */  public static void main(String[] args) throws Exception {    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));    while (true) {      System.out.print("Query: ");      String line = in.readLine();      System.out.println(parseQuery(line, NutchConfiguration.create()));    }  }/** Parse a query. */  final public Query parse(Configuration conf) throws ParseException {  Query query = new Query(conf);  ArrayList terms;  Token token;  String field;  boolean stop;  boolean prohibited;    nonOpOrTerm();    label_1:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case WORD:      case ACRONYM:      case SIGRAM:      case PLUS:      case MINUS:      case QUOTE:        ;        break;      default:        jj_la1[0] = jj_gen;        break label_1;      }      stop=true; prohibited=false; field = Clause.DEFAULT_FIELD;      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case PLUS:      case MINUS:        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case PLUS:          jj_consume_token(PLUS);              stop=false;          break;        case MINUS:          jj_consume_token(MINUS);                                        stop=false;prohibited=true;          break;        default:          jj_la1[1] = jj_gen;          jj_consume_token(-1);          throw new ParseException();        }        break;      default:        jj_la1[2] = jj_gen;        ;      }      if (jj_2_1(2147483647)) {        token = jj_consume_token(WORD);        jj_consume_token(COLON);                             field = token.image;      } else {        ;      }      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case QUOTE:        terms = phrase(field);                           stop=false;        break;      case WORD:      case ACRONYM:      case SIGRAM:        // quoted terms or              terms = compound(field);        break;      default:        jj_la1[3] = jj_gen;        jj_consume_token(-1);        throw new ParseException();      }      nonOpOrTerm();      String[] array = (String[])terms.toArray(new String[terms.size()]);      if (stop          && field == Clause.DEFAULT_FIELD          && terms.size()==1          && isStopWord(array[0])) {        // ignore stop words only when single, unadorned terms in default field      } else {        if (prohibited)          query.addProhibitedPhrase(array, field);        else          query.addRequiredPhrase(array, field);      }    }    {if (true) return query;}    throw new Error("Missing return statement in function");  }/** Parse an explcitly quoted phrase query.  Note that this may return a single * term, a trivial phrase.*/  final public ArrayList phrase(String field) throws ParseException {  int start;  int end;  ArrayList result = new ArrayList();  String term;    jj_consume_token(QUOTE);    start = token.endColumn;    label_2:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case PLUS:      case MINUS:      case COLON:      case SLASH:      case DOT:      case ATSIGN:      case APOSTROPHE:      case WHITE:        ;        break;      default:        jj_la1[4] = jj_gen;        break label_2;      }      nonTerm();    }    label_3:    while (true) {      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case WORD:      case ACRONYM:      case SIGRAM:        ;        break;      default:        jj_la1[5] = jj_gen;        break label_3;      }      term = term();                    result.add(term);      label_4:      while (true) {        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case PLUS:        case MINUS:        case COLON:        case SLASH:        case DOT:        case ATSIGN:        case APOSTROPHE:        case WHITE:          ;          break;        default:          jj_la1[6] = jj_gen;          break label_4;        }        nonTerm();      }    }    end = token.endColumn;    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case QUOTE:      jj_consume_token(QUOTE);      break;    case 0:      jj_consume_token(0);      break;    default:      jj_la1[7] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }    if (this.queryFilters.isRawField(field)) {      result.clear();      result.add(queryString.substring(start, end));    }    {if (true) return result;}    throw new Error("Missing return statement in function");  }/** Parse a compound term that is interpreted as an implicit phrase query. * Compounds are a sequence of terms separated by infix characters.  Note that * htis may return a single term, a trivial compound. */  final public ArrayList compound(String field) throws ParseException {  int start;  ArrayList result = new ArrayList();  String term;  StringBuffer terms = new StringBuffer();    start = token.endColumn;    term = term();    terms.append(term).append(" ");    //result.add(term);    label_5:    while (true) {      if (jj_2_2(2147483647)) {        ;      } else {        break label_5;      }      label_6:      while (true) {        infix();        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case PLUS:        case MINUS:        case COLON:        case SLASH:        case DOT:        case ATSIGN:        case APOSTROPHE:          ;          break;        default:          jj_la1[8] = jj_gen;          break label_6;        }      }      term = term();      terms.append(term).append(" ");      //result.add(term);    }    if (this.queryFilters.isRawField(field)) {//      result.clear();      result.add(queryString.substring(start, token.endColumn));    } else {      org.apache.lucene.analysis.Token token;      TokenStream tokens = analyzer.tokenStream(                              field, new StringReader(terms.toString()));      while (true) {        try {          token = tokens.next();        } catch (IOException e) {          token = null;        }        if (token == null) { break; }        result.add(token.termText());      }      try {        tokens.close();      } catch (IOException e) {        // ignore      }    }    {if (true) return result;}    throw new Error("Missing return statement in function");  }/** Parse a single term. */  final public String term() throws ParseException {  Token token;    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case WORD:      token = jj_consume_token(WORD);      break;    case ACRONYM:      token = jj_consume_token(ACRONYM);      break;    case SIGRAM:      token = jj_consume_token(SIGRAM);      break;    default:      jj_la1[9] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }    {if (true) return token.image;}    throw new Error("Missing return statement in function");  }/** Parse anything but a term or a quote. */  final public void nonTerm() throws ParseException {    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case WHITE:      jj_consume_token(WHITE);      break;    case PLUS:    case MINUS:    case COLON:    case SLASH:    case DOT:    case ATSIGN:    case APOSTROPHE:      infix();      break;    default:      jj_la1[10] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }  }  final public void nonTermOrEOF() throws ParseException {    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case PLUS:    case MINUS:    case COLON:    case SLASH:    case DOT:    case ATSIGN:    case APOSTROPHE:    case WHITE:      nonTerm();      break;    case 0:      jj_consume_token(0);      break;    default:      jj_la1[11] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }  }/** Parse anything but a term or an operator (plur or minus or quote). */  final public void nonOpOrTerm() throws ParseException {    label_7:    while (true) {      if (jj_2_3(2)) {        ;      } else {        break label_7;      }      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {      case WHITE:        jj_consume_token(WHITE);        break;      case COLON:      case SLASH:      case DOT:      case ATSIGN:      case APOSTROPHE:        nonOpInfix();        break;      case PLUS:      case MINUS:        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {        case PLUS:          jj_consume_token(PLUS);          break;        case MINUS:          jj_consume_token(MINUS);          break;        default:          jj_la1[12] = jj_gen;          jj_consume_token(-1);          throw new ParseException();        }        nonTermOrEOF();        break;      default:        jj_la1[13] = jj_gen;        jj_consume_token(-1);        throw new ParseException();      }    }  }/** Characters which can be used to form compound terms. */  final public void infix() throws ParseException {    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case PLUS:      jj_consume_token(PLUS);      break;    case MINUS:      jj_consume_token(MINUS);      break;    case COLON:    case SLASH:    case DOT:    case ATSIGN:    case APOSTROPHE:      nonOpInfix();      break;    default:      jj_la1[14] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }  }/** Parse infix characters except plus and minus. */  final public void nonOpInfix() throws ParseException {    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {    case COLON:      jj_consume_token(COLON);      break;    case SLASH:      jj_consume_token(SLASH);      break;    case DOT:      jj_consume_token(DOT);      break;    case ATSIGN:      jj_consume_token(ATSIGN);      break;    case APOSTROPHE:      jj_consume_token(APOSTROPHE);      break;    default:      jj_la1[15] = jj_gen;      jj_consume_token(-1);      throw new ParseException();    }  }  final private boolean jj_2_1(int xla) {    jj_la = xla; jj_lastpos = jj_scanpos = token;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -