📄 nutchanalysis.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */
package net.nutch.analysis;

import net.nutch.searcher.Query;
import net.nutch.searcher.QueryFilters;
import net.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis implements NutchAnalysisConstants {

  // English words treated as stop words.  Per isStopWord's contract they are
  // only dropped from queries; they are still indexed.
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  // Lookup set built from STOP_WORDS with Lucene's StopFilter helper; this is
  // what isStopWord consults.
  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  // The original query text.  Assigned by parseQuery and read by phrase() and
  // compound() to cut the raw substring out of the query for raw fields.
  private String queryString;

  /**
   * Tests whether a word is a stop word.  Stop words are only removed from
   * queries; every word is indexed.
   *
   * @param word the word to look up in the stop set
   * @return true iff the stop set contains {@code word}
   */
  public static boolean isStopWord(String word) {
    final boolean listed = STOP_SET.contains(word);
    return listed;
  }

  /**
   * Parses a query string into a {@link Query}.
   *
   * <p>A fresh parser is created over the characters of {@code queryString},
   * the string is remembered on the parser (phrase() and compound() read it
   * back), and the result of {@link #parse()} is returned.
   *
   * @param queryString the query text to parse
   * @return the parsed query
   * @throws IOException declared by this method; parse() is declared to throw
   *         ParseException, which presumably is an IOException subtype in
   *         this project -- confirm against ParseException's declaration
   */
  public static Query parseQuery(String queryString) throws IOException {
    final StringReader source = new StringReader(queryString);
    final NutchAnalysis parser = new NutchAnalysis(new FastCharStream(source));
    parser.queryString = queryString;
    final Query parsed = parser.parse();
    return parsed;
  }

  /**
   * For debugging.  Prompts on standard output, reads one query per line from
   * standard input, and prints the parsed query.  The loop ends when standard
   * input reaches end of stream.
   *
   * @param args ignored
   * @throws Exception if reading input or parsing a query fails
   */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {
        // readLine() returns null at end of input; passing that on to
        // parseQuery would fail with a NullPointerException, so stop here.
        break;
      }
      System.out.println(parseQuery(line));
    }
  }

/**
 * Parse a query.
 *
 * <p>Reads a sequence of clauses.  Each clause is an optional PLUS or MINUS
 * operator, an optional <code>field:</code> prefix, and then either a quoted
 * phrase or an unquoted (possibly compound) term.  Each clause is added to the
 * result as a required phrase, or as a prohibited phrase when it was prefixed
 * with MINUS.  A clause that is a single stop word with no operator and no
 * quotes is dropped.
 *
 * @return the query built from all clauses
 * @throws ParseException if the input does not match the grammar
 */
  final public Query parse() throws ParseException {
  	Query query = new Query();
  	ArrayList terms;
  	Token token;
  	String field;
  	boolean stop;
  	boolean prohibited;
  	// Skip anything before the first clause that is neither a term nor an operator.
  	nonOpOrTerm();
  	label_1:
  		while (true) {
  			// Continue only while the next token can start a clause.
  			switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  			case WORD:
  			case ACRONYM:
  			case SIGRAM:
  			case PLUS:
  			case MINUS:
  			case QUOTE:
  				;
  				break;
  			default:
  				jj_la1[0] = jj_gen;
  			break label_1;
  			}
  			// Per-clause defaults: stop-word removal allowed, clause required, default field.
  			stop=true; prohibited=false; field = Clause.DEFAULT_FIELD;
  			// Optional operator.  Either operator disables stop-word removal;
  			// MINUS additionally marks the clause as prohibited.
  			switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  			case PLUS:
  			case MINUS:
  				switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  				case PLUS:
  					jj_consume_token(PLUS);
  					stop=false;
  					break;
  				case MINUS:
  					jj_consume_token(MINUS);
  					stop=false;prohibited=true;
  					break;
  				default:
  					jj_la1[1] = jj_gen;
  				jj_consume_token(-1);
  				throw new ParseException();
  				}
  				break;
  			default:
  				jj_la1[2] = jj_gen;
  			;
  			}
  			// Optional "field:" prefix, taken only when the lookahead routine
  			// accepts (the routine itself is outside this view; the limit
  			// passed is Integer.MAX_VALUE).
  			if (jj_2_1(2147483647)) {
  				token = jj_consume_token(WORD);
  				jj_consume_token(COLON);
  				field = token.image;
  			} else {
  				;
  			}
  			switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  			case QUOTE:
  				// an explicitly quoted phrase; never subject to stop-word removal
  				terms = phrase(field);
  				stop=false;
  				break;
  			case WORD:
  			case ACRONYM:
  			case SIGRAM:
  				// an unquoted term, possibly a compound of several terms
  				terms = compound(field);
  				break;
  			default:
  				jj_la1[3] = jj_gen;
  			jj_consume_token(-1);
  			throw new ParseException();
  			}
  			// Skip separators that follow the clause.
  			nonOpOrTerm();
  			String[] array = (String[])terms.toArray(new String[terms.size()]);
  			
  			if (stop && terms.size()==1 && isStopWord(array[0])) {
  				// ignore stop words only when single, unadorned terms
  			} else {
  				if (prohibited)
  					query.addProhibitedPhrase(array, field);
  				else
  					query.addRequiredPhrase(array, field);
  			}
  		}
  	// The "if (true)" wrapper keeps the throw below from being an
  	// unreachable statement at compile time.
  	{if (true) return query;}
  	throw new Error("Missing return statement in function");
  }

/**
 * Parse an explicitly quoted phrase query.  Note that this may return a single
 * term, a trivial phrase.
 *
 * <p>Consumes the opening QUOTE, then any mix of non-term tokens and terms,
 * then the closing QUOTE.  For a field that QueryFilters reports as raw, the
 * collected terms are discarded and replaced by the single substring of the
 * original query text between the recorded start and end positions.
 *
 * @param field the field the phrase applies to
 * @return the list of terms (Strings) in the phrase
 * @throws ParseException if the input does not match the grammar
 */
  final public ArrayList phrase(String field) throws ParseException {
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
    jj_consume_token(QUOTE);
    // "token" is not a local here; it is the parser-level token also used by
    // the jj_2_N helpers.  The code uses endColumn as an offset into
    // queryString -- NOTE(review): confirm against the char stream in use.
    start = token.endColumn;
    // Skip non-term tokens that precede the first term.
    label_2:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case 0:
      case PLUS:
      case MINUS:
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
      case WHITE:
        ;
        break;
      default:
        jj_la1[4] = jj_gen;
        break label_2;
      }
      nonTerm();
    }
    // Collect terms, skipping the non-term tokens that follow each one.
    label_3:
    while (true) {
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WORD:
      case ACRONYM:
      case SIGRAM:
        ;
        break;
      default:
        jj_la1[5] = jj_gen;
        break label_3;
      }
      term = term();
                    result.add(term);
      label_4:
      while (true) {
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case 0:
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
        case WHITE:
          ;
          break;
        default:
          jj_la1[6] = jj_gen;
          break label_4;
        }
        nonTerm();
      }
    }
    // Record the end position before the closing quote is consumed.
    end = token.endColumn;
    jj_consume_token(QUOTE);
    if (QueryFilters.isRawField(field)) {
      // Raw fields get the untokenized text between the quotes instead.
      result.clear();
      result.add(queryString.substring(start, end));
    }
    // The "if (true)" wrapper keeps the throw below from being an
    // unreachable statement at compile time.
    {if (true) return result;}
    throw new Error("Missing return statement in function");
  }

/**
 * Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound.
 *
 * <p>For a field that QueryFilters reports as raw, the collected terms are
 * discarded and replaced by the single substring of the original query text
 * running from the recorded start position to the end of the last consumed
 * token.
 *
 * @param field the field the compound applies to
 * @return the list of terms (Strings) in the compound
 * @throws ParseException if the input does not match the grammar
 */
  final public ArrayList compound(String field) throws ParseException {
  int start;
  ArrayList result = new ArrayList();
  String term;
    // Taken before the first term is consumed, so this is the end position of
    // whatever token was consumed last.  "token" is the parser-level token,
    // not a local.  NOTE(review): endColumn is used as an offset into
    // queryString -- confirm against the char stream in use.
    start = token.endColumn;
    term = term();
                  result.add(term);
    label_5:
    while (true) {
      // Go on only if the lookahead routine accepts (the routine itself is
      // outside this view; the limit passed is Integer.MAX_VALUE).
      if (jj_2_2(2147483647)) {
        ;
      } else {
        break label_5;
      }
      // One or more infix characters between two terms.
      label_6:
      while (true) {
        infix();
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
        case MINUS:
        case COLON:
        case SLASH:
        case DOT:
        case ATSIGN:
        case APOSTROPHE:
          ;
          break;
        default:
          jj_la1[7] = jj_gen;
          break label_6;
        }
      }
      term = term();
                    result.add(term);
    }
    if (QueryFilters.isRawField(field)) {
      // Raw fields get the untokenized text of the whole compound instead.
      result.clear();
      result.add(queryString.substring(start, token.endColumn));
    }
    // The "if (true)" wrapper keeps the throw below from being an
    // unreachable statement at compile time.
    {if (true) return result;}
    throw new Error("Missing return statement in function");
  }

/**
 * Parse a single term: one WORD, ACRONYM or SIGRAM token.
 *
 * @return the image (text) of the consumed token
 * @throws ParseException if the next token is none of those three kinds
 */
  final public String term() throws ParseException {
    final int kind = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    if (kind != WORD && kind != ACRONYM && kind != SIGRAM) {
      // Not a term: do the same error bookkeeping as the other productions.
      jj_la1[8] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
    final Token matched = jj_consume_token(kind);
    return matched.image;
  }

/**
 * Parse anything but a term or a quote: a WHITE token, an infix character,
 * or the token of kind 0 (presumably end of input; the constants are declared
 * outside this view).
 *
 * @throws ParseException if the next token is none of the above
 */
  final public void nonTerm() throws ParseException {
    final int kind = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    if (kind == WHITE || kind == 0) {
      jj_consume_token(kind);
    } else if (kind == PLUS || kind == MINUS || kind == COLON || kind == SLASH
        || kind == DOT || kind == ATSIGN || kind == APOSTROPHE) {
      infix();
    } else {
      jj_la1[9] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

/**
 * Parse anything but a term or an operator (plus or minus or quote).
 *
 * <p>Repeats for as long as the two-token lookahead accepts.  Each round
 * consumes a WHITE token, a non-operator infix character, or a PLUS/MINUS
 * that is followed by a non-term.
 *
 * @throws ParseException if the input does not match the grammar
 */
  final public void nonOpOrTerm() throws ParseException {
    label_7:
    while (true) {
      // Lookahead limited to 2 tokens; the routine it runs is outside this view.
      if (jj_2_3(2)) {
        ;
      } else {
        break label_7;
      }
      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
      case WHITE:
        jj_consume_token(WHITE);
        break;
      case COLON:
      case SLASH:
      case DOT:
      case ATSIGN:
      case APOSTROPHE:
        nonOpInfix();
        break;
      case PLUS:
      case MINUS:
        // Inner switch picks which of the two operators to consume.
        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
        case PLUS:
          jj_consume_token(PLUS);
          break;
        case MINUS:
          jj_consume_token(MINUS);
          break;
        default:
          jj_la1[10] = jj_gen;
          jj_consume_token(-1);
          throw new ParseException();
        }
        // The operator must be followed by a non-term.
        nonTerm();
        break;
      default:
        jj_la1[11] = jj_gen;
        jj_consume_token(-1);
        throw new ParseException();
      }
    }
  }

/**
 * Characters which can be used to form compound terms: PLUS, MINUS, or any
 * of the characters accepted by {@link #nonOpInfix()}.
 *
 * @throws ParseException if the next token is not an infix character
 */
  final public void infix() throws ParseException {
    final int kind = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (kind) {
    case PLUS:
    case MINUS:
      // The two operator characters are consumed here directly.
      jj_consume_token(kind);
      return;
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      nonOpInfix();
      return;
    default:
      jj_la1[12] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

/**
 * Parse infix characters except plus and minus: consumes one COLON, SLASH,
 * DOT, ATSIGN or APOSTROPHE token.
 *
 * @throws ParseException if the next token is none of those kinds
 */
  final public void nonOpInfix() throws ParseException {
    final int kind = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
    switch (kind) {
    case COLON:
    case SLASH:
    case DOT:
    case ATSIGN:
    case APOSTROPHE:
      // Whichever of the five kinds is next, consume exactly that kind.
      jj_consume_token(kind);
      return;
    default:
      jj_la1[13] = jj_gen;
      jj_consume_token(-1);
      throw new ParseException();
    }
  }

  /**
   * Lookahead check number 1.  Starts scanning at the current token with a
   * limit of {@code xla} and runs jj_3_1().
   *
   * @param xla the lookahead limit, stored in jj_la
   * @return true when jj_3_1() returns false or a LookaheadSuccess is thrown
   */
  final private boolean jj_2_1(int xla) {
    jj_la = xla;
    jj_scanpos = token;
    jj_lastpos = jj_scanpos;
    boolean accepted;
    try {
      accepted = !jj_3_1();
    } catch (LookaheadSuccess ls) {
      accepted = true;
    } finally {
      // Runs on every exit path, including exceptions.
      jj_save(0, xla);
    }
    return accepted;
  }

  /**
   * Lookahead check number 2.  Starts scanning at the current token with a
   * limit of {@code xla} and runs jj_3_2().
   *
   * @param xla the lookahead limit, stored in jj_la
   * @return true when jj_3_2() returns false or a LookaheadSuccess is thrown
   */
  final private boolean jj_2_2(int xla) {
    jj_la = xla;
    jj_scanpos = token;
    jj_lastpos = jj_scanpos;
    boolean accepted;
    try {
      accepted = !jj_3_2();
    } catch (LookaheadSuccess ls) {
      accepted = true;
    } finally {
      // Runs on every exit path, including exceptions.
      jj_save(1, xla);
    }
    return accepted;
  }

  /**
   * Lookahead check number 3.  Starts scanning at the current token with a
   * limit of {@code xla} and runs jj_3_3().
   *
   * @param xla the lookahead limit, stored in jj_la
   * @return true when jj_3_3() returns false or a LookaheadSuccess is thrown
   */
  final private boolean jj_2_3(int xla) {
    jj_la = xla;
    jj_scanpos = token;
    jj_lastpos = jj_scanpos;
    boolean accepted;
    try {
      accepted = !jj_3_3();
    } catch (LookaheadSuccess ls) {
      accepted = true;
    } finally {
      // Runs on every exit path, including exceptions.
      jj_save(2, xla);
    }
    return accepted;
  }

  /**
   * Lookahead helper that delegates entirely to jj_3R_18().
   *
   * @return whatever jj_3R_18() returns
   */
  final private boolean jj_3R_24() {
    return jj_3R_18();
  }

  final private boolean jj_3R_11() {
    Token xsp;
    xsp = jj_scanpos;
    if (jj_scan_token(1)) {
    jj_scanpos = xsp;
12 下一页
💿 文件大小 146 K
👤 上传用户 beixinning
📂 所属分类其他
🏷️ 相关标签

#bug #数据 #正
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -