📄 AnalyzerUtil.java
   */
  public static Analyzer getTokenCachingAnalyzer(final Analyzer child) {
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");

    return new Analyzer() {

      private final HashMap cache = new HashMap();

      public TokenStream tokenStream(String fieldName, Reader reader) {
        final ArrayList tokens = (ArrayList) cache.get(fieldName);
        if (tokens == null) { // not yet cached
          final ArrayList tokens2 = new ArrayList();
          TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {

            public Token next() throws IOException {
              Token token = input.next(); // from filter super class
              if (token != null) tokens2.add(token);
              return token;
            }
          };
          cache.put(fieldName, tokens2);
          return tokenStream;
        } else { // already cached
          return new TokenStream() {

            private Iterator iter = tokens.iterator();

            public Token next() {
              if (!iter.hasNext()) return null;
              return (Token) iter.next();
            }
          };
        }
      }
    };
  }

  /**
   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
   * sorted descending by frequency (and ascending by term, if tied).
   * <p>
   * Example XQuery:
   * <pre>
   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
   *
   * for $pair in util:get-most-frequent-terms(
   *   analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
   * </pre>
   *
   * @param analyzer
   *            the analyzer to use for splitting text into terms (aka words)
   * @param text
   *            the text to analyze
   * @param limit
   *            the maximum number of pairs to return; zero indicates
   *            "as many as possible".
   * @return an array of (frequency:term) pairs in the form of (freq0:term0,
   *         freq1:term1, ..., freqN:termN). Each pair is a single string
   *         separated by a ':' delimiter.
   */
  public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
    if (analyzer == null)
      throw new IllegalArgumentException("analyzer must not be null");
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    if (limit <= 0) limit = Integer.MAX_VALUE;

    // compute frequencies of distinct terms
    HashMap map = new HashMap();
    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
    try {
      Token token;
      while ((token = stream.next()) != null) {
        MutableInteger freq = (MutableInteger) map.get(token.termText());
        if (freq == null) {
          freq = new MutableInteger(1);
          map.put(token.termText(), freq);
        } else {
          freq.setValue(freq.intValue() + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        stream.close();
      } catch (IOException e2) {
        throw new RuntimeException(e2);
      }
    }

    // sort by frequency, then by term text
    Map.Entry[] entries = new Map.Entry[map.size()];
    map.entrySet().toArray(entries);
    Arrays.sort(entries, new Comparator() {
      public int compare(Object o1, Object o2) {
        Map.Entry e1 = (Map.Entry) o1;
        Map.Entry e2 = (Map.Entry) o2;
        int f1 = ((MutableInteger) e1.getValue()).intValue();
        int f2 = ((MutableInteger) e2.getValue()).intValue();
        if (f2 - f1 != 0) return f2 - f1;
        String s1 = (String) e1.getKey();
        String s2 = (String) e2.getKey();
        return s1.compareTo(s2);
      }
    });

    // return top N entries
    int size = Math.min(limit, entries.length);
    String[] pairs = new String[size];
    for (int i = 0; i < size; i++) {
      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
    }
    return pairs;
  }

  private static final class MutableInteger {
    private int value;

    public MutableInteger(int value) { this.value = value; }

    public int intValue() { return value; }

    public void setValue(int value) { this.value = value; }

    public String toString() { return String.valueOf(value); }
  }

  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
  /** (Line terminator followed by zero or more whitespace) two or more times */
  private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");

  /**
   * Returns at most the first N paragraphs of the given text. Delimiting
   * characters are excluded from the results. Each returned paragraph is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   *
   * @param text
   *            the text to tokenize into paragraphs
   * @param limit
   *            the maximum number of paragraphs to return; zero indicates
   *            "as many as possible".
   * @return the first N paragraphs
   */
  public static String[] getParagraphs(String text, int limit) {
    return tokenize(PARAGRAPHS, text, limit);
  }

  private static String[] tokenize(Pattern pattern, String text, int limit) {
    String[] tokens = pattern.split(text, limit);
    for (int i = tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
    return tokens;
  }

  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
  /** Divides text into sentences; includes the inverted Spanish exclamation and question marks */
  private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");

  /**
   * Returns at most the first N sentences of the given text. Delimiting
   * characters are excluded from the results. Each returned sentence is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   *
   * @param text
   *            the text to tokenize into sentences
   * @param limit
   *            the maximum number of sentences to return; zero indicates
   *            "as many as possible".
   * @return the first N sentences
   */
  public static String[] getSentences(String text, int limit) {
//    return tokenize(SENTENCES, text, limit); // equivalent but slower
    int len = text.length();
    if (len == 0) return new String[] { text };
    if (limit <= 0) limit = Integer.MAX_VALUE;

    // average sentence length heuristic
    String[] tokens = new String[Math.min(limit, 1 + len/40)];
    int size = 0;
    int i = 0;
    while (i < len && size < limit) {
      // scan to end of current sentence
      int start = i;
      while (i < len && !isSentenceSeparator(text.charAt(i))) i++;

      if (size == tokens.length) { // grow array
        String[] tmp = new String[tokens.length << 1];
        System.arraycopy(tokens, 0, tmp, 0, size);
        tokens = tmp;
      }
      // add sentence (potentially empty)
      tokens[size++] = text.substring(start, i).trim();

      // scan to beginning of next sentence
      while (i < len && isSentenceSeparator(text.charAt(i))) i++;
    }

    if (size == tokens.length) return tokens;
    String[] results = new String[size];
    System.arraycopy(tokens, 0, results, 0, size);
    return results;
  }

  private static boolean isSentenceSeparator(char c) {
    // regex [!\\.\\?\\xA1\\xBF]
    switch (c) {
      case '!': return true;
      case '.': return true;
      case '?': return true;
      case 0xA1: return true; // Spanish inverted exclamation mark
      case 0xBF: return true; // Spanish inverted question mark
      default: return false;
    }
  }

}
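
A minimal usage sketch for getTokenCachingAnalyzer, assuming the pre-2.9 Lucene token API used in the listing above and org.apache.lucene.analysis.SimpleAnalyzer as the child analyzer; the demo class name and sample text are illustrative only. The second pass over the same field name replays the cached tokens instead of re-running the child analyzer.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.AnalyzerUtil;

public class TokenCachingDemo {
  public static void main(String[] args) throws IOException {
    Analyzer cached = AnalyzerUtil.getTokenCachingAnalyzer(new SimpleAnalyzer());
    for (int pass = 1; pass <= 2; pass++) {
      // pass 1 runs SimpleAnalyzer and fills the per-field cache; pass 2 replays the cached tokens
      TokenStream stream = cached.tokenStream("content",
          new StringReader("to be or not to be that is the question"));
      Token token;
      while ((token = stream.next()) != null) {
        System.out.println("pass " + pass + ": " + token.termText());
      }
    }
  }
}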
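
The XQuery example in the getMostFrequentTerms javadoc has a plain-Java counterpart. A sketch under the same API assumptions, with SimpleAnalyzer standing in for PatternAnalyzer's EXTENDED_ANALYZER; the demo class name and sample text are illustrative only.

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.memory.AnalyzerUtil;

public class FrequentTermsDemo {
  public static void main(String[] args) {
    String text = "the quick brown fox jumps over the lazy dog and the dog sleeps";
    // top 3 (frequency:term) pairs, most frequent first, ties broken by term
    String[] pairs = AnalyzerUtil.getMostFrequentTerms(new SimpleAnalyzer(), text, 3);
    for (int i = 0; i < pairs.length; i++) {
      int colon = pairs[i].indexOf(':');
      String freq = pairs[i].substring(0, colon);
      String term = pairs[i].substring(colon + 1);
      System.out.println(term + " occurs " + freq + " times");
    }
  }
}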
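
getParagraphs and getSentences take no analyzer at all. A small sketch of both splitters; the demo class name and sample text are illustrative only.

import org.apache.lucene.index.memory.AnalyzerUtil;

public class SplitDemo {
  public static void main(String[] args) {
    String text = "First sentence. Second sentence! Third?\n\nA new paragraph.";
    String[] paragraphs = AnalyzerUtil.getParagraphs(text, 0); // 0 = as many as possible
    String[] sentences  = AnalyzerUtil.getSentences(text, 2);  // at most the first two
    System.out.println(paragraphs.length + " paragraphs; first = \"" + paragraphs[0] + "\"");
    for (int i = 0; i < sentences.length; i++) {
      System.out.println("sentence " + i + ": " + sentences[i]);
    }
  }
}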