📄 patternanalyzer.java
字号:
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import java.io.Reader;import java.io.StringReader;import java.util.Arrays;import java.util.HashSet;import java.util.Locale;import java.util.Set;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.StopAnalyzer;import org.apache.lucene.analysis.StopFilter;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenStream;/** * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern} * (with behaviour identical to {@link String#split(String)}), * and that combines the functionality of * {@link org.apache.lucene.analysis.LetterTokenizer}, * {@link org.apache.lucene.analysis.LowerCaseTokenizer}, * {@link org.apache.lucene.analysis.WhitespaceTokenizer}, * {@link org.apache.lucene.analysis.StopFilter} into a single efficient * multi-purpose class. 
* <p> * If you are unsure how exactly a regular expression should look like, consider * prototyping by simply trying various expressions on some test texts via * {@link String#split(String)}. Once you are satisfied, give that regex to * PatternAnalyzer. Also see <a target="_blank" * href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>. * <p> * This class can be considerably faster than the "normal" Lucene tokenizers. * It can also serve as a building block in a compound Lucene * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this * stemming example: * <pre> * PatternAnalyzer pat = ... * TokenStream tokenStream = new SnowballFilter( * pat.tokenStream("content", "James is running round in the woods"), * "English")); * </pre> * * @author whoschek.AT.lbl.DOT.gov */public class PatternAnalyzer extends Analyzer { /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */ public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+"); /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */ public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] { "a", "about", "above", "across", "adj", "after", "afterwards", "again", "against", "albeit", "all", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anywhere", "are", "around", "as", "at", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "co", "could", "down", "during", "each", "eg", "either", "else", "elsewhere", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "first", "for", "former", "formerly", "from", 
"further", "had", "has", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", "latter", "latterly", "least", "less", "ltd", "many", "may", "me", "meanwhile", "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "namely", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", "rather", "s", "same", "seem", "seemed", "seeming", "seems", "several", "she", "should", "since", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "t", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefor", "therein", "thereupon", "these", "they", "this", "those", "though", "through", "throughout", "thru", "thus", "to", "together", "too", "toward", "towards", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereafter", "whereas", "whereat", "whereby", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "with", "within", "without", "would", "xsubj", "xcal", "xauthor", "xother ", "xnote", "yet", "you", "your", "yours", "yourself", "yourselves"}); /** * A lower-casing word analyzer with English stop words (can be shared * freely across threads without harm); global per class loader. 
*/ public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer( NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS)); /** * A lower-casing word analyzer with <b>extended </b> English stop words * (can be shared freely across threads without harm); global per class * loader. The stop words are borrowed from * http://thomas.loc.gov/home/stopwords.html, see * http://thomas.loc.gov/home/all.about.inquery.html */ public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer( NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS); private final Pattern pattern; private final boolean toLowerCase; private final Set stopWords; /** * Constructs a new instance with the given parameters. * * @param pattern * a regular expression delimiting tokens * @param toLowerCase * if <code>true</code> returns tokens after applying * String.toLowerCase() * @param stopWords * if non-null, ignores all tokens that are contained in the * given stop set (after previously having applied toLowerCase() * if applicable). For example, created via * {@link StopFilter#makeStopSet(String[])}and/or * {@link org.apache.lucene.analysis.WordlistLoader}as in * <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code> * or <a href="http://www.unine.ch/info/clef/">other stop words * lists </a>. */ public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) { if (pattern == null) throw new IllegalArgumentException("pattern must not be null"); if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN; else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN; if (stopWords != null && stopWords.size() == 0) stopWords = null; this.pattern = pattern; this.toLowerCase = toLowerCase; this.stopWords = stopWords; } /** * Creates a token stream that tokenizes the given string into token terms * (aka words). * * @param fieldName * the name of the field to tokenize (currently ignored). 
* @param text
   *            the string to tokenize
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null) {
      throw new IllegalArgumentException("text must not be null");
    }

    // Fast path: both shared patterns go to FastStringTokenizer, which also
    // receives the lower-casing flag and the stop set. Its boolean argument is
    // true for NON_WORD_PATTERN and false for WHITESPACE_PATTERN.
    boolean splitsOnNonWord = (pattern == NON_WORD_PATTERN);
    if (splitsOnNonWord || pattern == WHITESPACE_PATTERN) {
      return new FastStringTokenizer(text, splitsOnNonWord, toLowerCase, stopWords);
    }

    // General path: regex-driven tokenizer, wrapped in a StopFilter only when
    // a stop set is configured.
    TokenStream tokens = new PatternTokenizer(text, pattern, toLowerCase);
    return (stopWords == null) ? tokens : new StopFilter(tokens, stopWords);
  }

  /**
   * Creates a token stream that tokenizes all the text in the given Reader;
   * This implementation forwards to <code>tokenStream(String, String)</code> and is
   * less efficient than <code>tokenStream(String, String)</code>.
   *
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    String text;
    if (reader instanceof FastStringReader) {
      // Fast path: take the string directly from the FastStringReader.
      text = ((FastStringReader) reader).getString();
    } else {
      try {
        text = toString(reader);
      } catch (IOException e) {
        // Rethrown unchecked with the original exception as cause.
        throw new RuntimeException(e);
      }
    }
    return tokenStream(fieldName, text);
  }

  /**
   * Indicates whether some other object is "equal to" this one.
   *
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -