📄 standardtokenizerimpl.jflex
字号:
package org.apache.lucene.analysis.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

%%

// Scanner options. Each directive must start in column 0 on its own line.
// The generated class is StandardTokenizerImpl; its scanning method is
// getNextToken() and returns an int token type. %char enables the yychar
// counter that yychar() below exposes.
%class StandardTokenizerImpl
%unicode
%integer
%function getNextToken
%pack
%char

%{

// Token type codes returned by the rule actions in the last section.
// They mirror the constants declared on StandardTokenizer so that both
// classes agree on the numbering.
public static final int ALPHANUM          = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE        = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM           = StandardTokenizer.ACRONYM;
public static final int COMPANY           = StandardTokenizer.COMPANY;
public static final int EMAIL             = StandardTokenizer.EMAIL;
public static final int HOST              = StandardTokenizer.HOST;
public static final int NUM               = StandardTokenizer.NUM;
public static final int CJ                = StandardTokenizer.CJ;

/**
 * @deprecated this solves a bug where HOSTs that end with '.' are identified
 *             as ACRONYMs. It is deprecated and will be removed in the next
 *             release.
 */
public static final int ACRONYM_DEP       = StandardTokenizer.ACRONYM_DEP;

/** Alias of {@link StandardTokenizer#TOKEN_TYPES}. */
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;

/**
 * Returns the scanner's {@code yychar} counter, which JFlex maintains
 * because the {@code %char} option is set.
 */
public final int yychar()
{
    return yychar;
}

/**
 * Fills Lucene token with the current token text.
 *
 * The text is the region of the scanner buffer that starts at
 * {@code zzStartRead} and is {@code zzMarkedPos - zzStartRead} chars long.
 *
 * @param t the token whose term buffer is overwritten
 */
final void getText(Token t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}

// ---------------------------------------------------------------------------
// Macro definitions. A macro may be referenced before the line that defines
// it (NUM uses P and HAS_DIGIT, ALPHANUM uses LETTER/DIGIT/KOREAN).
// ---------------------------------------------------------------------------

// basic word: a sequence of digits & letters
ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+

// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE =  {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM    =  {LETTER} "." ({LETTER} ".")+

// deprecated variant: dot-separated ALPHANUM runs WITH a trailing dot,
// e.g. a host name written at the end of a sentence (see ACRONYM_DEP above)
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname
HOST       =  {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// punctuation
P          = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT  =
    ({LETTER}|{DIGIT})*
    {DIGIT}
    ({LETTER}|{DIGIT})*

ALPHA      = ({LETTER})+

// A-Z, a-z, U+00C0-U+00FF without U+00D7 and U+00F7, all of U+0100-U+1FFF,
// and U+FFA0-U+FFDC.
// NOTE(review): U+0100-U+1FFF also contains every non-ASCII range listed in
// DIGIT and the U+1100-U+11FF part of KOREAN, so those code points count as
// LETTER too.
LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]

// ASCII 0-9 followed by further ten-code-point digit ranges between U+0660
// and U+1049 (the U+0BE7-U+0BEF range has only nine code points)
DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]

KOREAN     = [\uac00-\ud7af\u1100-\u11ff]

// Chinese, Japanese
// A single character, not a run: the CJ rule emits one token per character.
// NOTE(review): \u3040-\u318f already covers the three ranges listed right
// after it; the redundant entries are kept unchanged.
CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

WHITESPACE = \r\n | [ \r\n\t\f]

%%

// ---------------------------------------------------------------------------
// Lexical rules. JFlex takes the longest match and, when several rules match
// the same length, the one listed first - so the order below is significant.
// ACRONYM_DEP is deliberately listed after HOST and NUM.
// ---------------------------------------------------------------------------

{ALPHANUM}                                                     { return ALPHANUM; }
{APOSTROPHE}                                                   { return APOSTROPHE; }
{ACRONYM}                                                      { return ACRONYM; }
{COMPANY}                                                      { return COMPANY; }
{EMAIL}                                                        { return EMAIL; }
{HOST}                                                         { return HOST; }
{NUM}                                                          { return NUM; }
{CJ}                                                           { return CJ; }
{ACRONYM_DEP}                                                  { return ACRONYM_DEP; }

/** Ignore the rest */
. | {WHITESPACE}                                               { /* ignore */ }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -