📄 cstandardtokenizer.jj

📁 一个jsp写的bbs
💻 JJ
字号:
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

 /* to work with CWordTokenizer, StandardTokenizer has been modified to
    treat CJK same as normal englsih word */
options {
  STATIC = false;
//IGNORE_CASE = true;
//BUILD_PARSER = false;
  UNICODE_INPUT = true;
  USER_CHAR_STREAM = true;
  OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(CStandardTokenizer)

package org.apache.lucene.analysis.cw;
import java.io.*;
import org.apache.lucene.analysis.standard.FastCharStream;
import org.apache.lucene.analysis.standard.CharStream;
import org.apache.lucene.analysis.standard.ParseException;

/** A grammar-based tokenizer constructed with JavaCC.
 *
 * <p> This should be a good tokenizer for most European-language documents.
 *
 * <p>Many applications have specific tokenizer needs.  If this tokenizer does
 * not suit your application, please consider copying this source code
 * directory to your project and maintaining your own grammar-based tokenizer.
 */
public class CStandardTokenizer extends org.apache.lucene.analysis.Tokenizer {

  /** Constructs a tokenizer for this Reader. */
  public CStandardTokenizer(Reader reader) {
    this(new FastCharStream(reader));
    this.input = reader;
  }
}

PARSER_END(CStandardTokenizer)

TOKEN : {					  // token patterns

  // basic word: a sequence of digits & letters & cjk
  <ALPHANUM: (<LETTER>|<DIGIT>|[
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ])+ >

  // internal apostrophes: O'Reilly, you're, O'Reilly's
  // use a post-filter to remove possesives
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >

  // acronyms: U.S.A., I.B.M., etc.
  // use a post-filter to remove dots
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >

  // company names like AT&T and Excite@Home.
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >

  // email addresses
| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >

  // hostname
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >

  // floating point, serial, model numbers, ip addresses, etc.
  // every other segment must have at least one digit
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
       | <HAS_DIGIT> <P> <ALPHANUM>
       | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
       | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
       | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
        )
  >
| <#P: ("_"|"-"|"/"|"."|",") >
| <#HAS_DIGIT:					  // at least one digit
    (<LETTER>|<DIGIT>)*
    <DIGIT>
    (<LETTER>|<DIGIT>)*
  >

| < #ALPHA: (<LETTER>)+>
| < #LETTER:					  // unicode letters
      [
       "\u0041"-"\u005a",
       "\u0061"-"\u007a",
       "\u00c0"-"\u00d6",
       "\u00d8"-"\u00f6",
       "\u00f8"-"\u00ff",
       "\u0100"-"\u1fff"
      ]
  >
| < #DIGIT:					  // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >
}

SKIP : {					  // skip unrecognized chars
 <NOISE: ~[] >
}

/** Returns the next token in the stream, or null at EOS.
 * <p>The returned token's type is set to an element of {@link
 * CStandardTokenizerConstants#tokenImage}.
 */
org.apache.lucene.analysis.Token next() throws IOException :
{
  Token token = null;
}
{
  ( token = <ALPHANUM> |
    token = <APOSTROPHE> |
    token = <ACRONYM> |
    token = <COMPANY> |
    token = <EMAIL> |
    token = <HOST> |
    token = <NUM> |
    token = <EOF>
   )
    {
      if (token.kind == EOF) {
	return null;
      } else {
	return
	  new org.apache.lucene.analysis.Token(token.image,
					token.beginColumn,token.endColumn,
					tokenImage[token.kind]);
      }
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -