📄 analyzerutil.java

📁 lucene2.2.0版本
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import java.io.IOException;import java.io.PrintStream;import java.io.Reader;import java.io.StringReader;import java.util.ArrayList;import java.util.Arrays;import java.util.Comparator;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.regex.Pattern;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.PorterStemFilter;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenFilter;import org.apache.lucene.analysis.TokenStream;/** * Various fulltext analysis utilities avoiding redundant code in several * classes. *  * @author whoschek.AT.lbl.DOT.gov */public class AnalyzerUtil {    private AnalyzerUtil() {};  /**   * Returns a simple analyzer wrapper that logs all tokens produced by the   * underlying child analyzer to the given log stream (typically System.err);   * Otherwise behaves exactly like the child analyzer, delivering the very   * same tokens; useful for debugging purposes on custom indexing and/or   * querying.   *    * @param child   *            the underlying child analyzer   * @param log   *            the print stream to log to (typically System.err)   * @param logName   *            a name for this logger (typically "log" or similar)   * @return a logging analyzer   */  public static Analyzer getLoggingAnalyzer(final Analyzer child,       final PrintStream log, final String logName) {        if (child == null)       throw new IllegalArgumentException("child analyzer must not be null");    if (log == null)       throw new IllegalArgumentException("logStream must not be null");    return new Analyzer() {      public TokenStream tokenStream(final String fieldName, Reader reader) {        return new TokenFilter(child.tokenStream(fieldName, reader)) {          private int position = -1;                    public Token next() throws IOException {            Token token = input.next(); // from filter super class            log.println(toString(token));            return token;          }                    private String toString(Token token) {            if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";                        position += token.getPositionIncrement();            return "[" + logName + ":" + position + ":" + fieldName + ":"                + token.termText() + ":" + token.startOffset()                + "-" + token.endOffset() + ":" + token.type()                + "]";          }                 };      }    };  }      /**   * Returns an analyzer wrapper that returns at most the first   * <code>maxTokens</code> tokens from the underlying child analyzer,   * ignoring all remaining tokens.   *    * @param child   *            the underlying child analyzer   * @param maxTokens   *            the maximum number of tokens to return from the underlying   *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)   * @return an analyzer wrapper   */  public static Analyzer getMaxTokenAnalyzer(      final Analyzer child, final int maxTokens) {        if (child == null)       throw new IllegalArgumentException("child analyzer must not be null");    if (maxTokens < 0)       throw new IllegalArgumentException("maxTokens must not be negative");    if (maxTokens == Integer.MAX_VALUE)       return child; // no need to wrap      return new Analyzer() {      public TokenStream tokenStream(String fieldName, Reader reader) {        return new TokenFilter(child.tokenStream(fieldName, reader)) {          private int todo = maxTokens;                    public Token next() throws IOException {            return --todo >= 0 ? input.next() : null;          }        };      }    };  }      /**   * Returns an English stemming analyzer that stems tokens from the   * underlying child analyzer according to the Porter stemming algorithm. The   * child analyzer must deliver tokens in lower case for the stemmer to work   * properly.   * <p>   * Background: Stemming reduces token terms to their linguistic root form   * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to   * "famili", as well as "complete" and "completion" to "complet". Note that   * the root form is not necessarily a meaningful word in itself, and that   * this is not a bug but rather a feature, if you lean back and think about   * fuzzy word matching for a bit.   * <p>   * See the Lucene contrib packages for stemmers (and stop words) for German,   * Russian and many more languages.   *    * @param child   *            the underlying child analyzer   * @return an analyzer wrapper   */  public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {        if (child == null)       throw new IllegalArgumentException("child analyzer must not be null");      return new Analyzer() {      public TokenStream tokenStream(String fieldName, Reader reader) {        return new PorterStemFilter(            child.tokenStream(fieldName, reader));//        /* PorterStemFilter and SnowballFilter have the same behaviour, //        but PorterStemFilter is much faster. *///        return new org.apache.lucene.analysis.snowball.SnowballFilter(//            child.tokenStream(fieldName, reader), "English");      }    };  }      /**   * Returns an analyzer wrapper that wraps the underlying child analyzer's   * token stream into a {@link SynonymTokenFilter}.   *    * @param child   *            the underlying child analyzer   * @param synonyms   *            the map used to extract synonyms for terms   * @param maxSynonyms   *            the maximum number of synonym tokens to return per underlying   *            token word (a value of Integer.MAX_VALUE indicates unlimited)   * @return a new analyzer   */  public static Analyzer getSynonymAnalyzer(final Analyzer child,       final SynonymMap synonyms, final int maxSynonyms) {        if (child == null)       throw new IllegalArgumentException("child analyzer must not be null");    if (synonyms == null)      throw new IllegalArgumentException("synonyms must not be null");    if (maxSynonyms < 0)       throw new IllegalArgumentException("maxSynonyms must not be negative");    if (maxSynonyms == 0)      return child; // no need to wrap      return new Analyzer() {      public TokenStream tokenStream(String fieldName, Reader reader) {        return new SynonymTokenFilter(          child.tokenStream(fieldName, reader), synonyms, maxSynonyms);      }    };  }    /**   * Returns an analyzer wrapper that caches all tokens generated by the underlying child analyzer's   * token streams, and delivers those cached tokens on subsequent calls to    * <code>tokenStream(String fieldName, Reader reader)</code>    * if the fieldName has been seen before, altogether ignoring the Reader parameter on cache lookup.   * <p>   * If Analyzer / TokenFilter chains are expensive in terms of I/O or CPU, such caching can    * help improve performance if the same document is added to multiple Lucene indexes,    * because the text analysis phase need not be performed more than once.   * <p>   * Caveats:    * <ul>   * <li>Caching the tokens of large Lucene documents can lead to out of memory exceptions.</li>    * <li>The Token instances delivered by the underlying child analyzer must be immutable.</li>   * <li>The same caching analyzer instance must not be used for more than one document   * because the cache is not keyed on the Reader parameter.</li>   * </ul>   *    * @param child   *            the underlying child analyzer   * @return a new analyzer
12 下一页
💿 文件大小 5913 K
👤 上传用户 jjjjjkkkkjkjkjk
📂 所属分类 Java编程
🏷️ 相关标签

#lucene #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -