📄 AnalyzerUtil.java
   */
  public static Analyzer getTokenCachingAnalyzer(final Analyzer child) {
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");

    return new Analyzer() {

      private final HashMap cache = new HashMap();

      public TokenStream tokenStream(String fieldName, Reader reader) {
        final ArrayList tokens = (ArrayList) cache.get(fieldName);
        if (tokens == null) { // not yet cached
          final ArrayList tokens2 = new ArrayList();
          TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {

            public Token next() throws IOException {
              Token token = input.next(); // from filter super class
              if (token != null) tokens2.add(token);
              return token;
            }
          };
          cache.put(fieldName, tokens2);
          return tokenStream;
        } else { // already cached
          return new TokenStream() {

            private Iterator iter = tokens.iterator();

            public Token next() {
              if (!iter.hasNext()) return null;
              return (Token) iter.next();
            }
          };
        }
      }
    };
  }

  /**
   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
   * sorted descending by frequency (and ascending by term, if tied).
   * <p>
   * Example XQuery:
   * <pre>
   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
   *
   * for $pair in util:get-most-frequent-terms(
   *   analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
   * </pre>
   *
   * @param analyzer
   *            the analyzer to use for splitting text into terms (aka words)
   * @param text
   *            the text to analyze
   * @param limit
   *            the maximum number of pairs to return; zero indicates
   *            "as many as possible".
   * @return an array of (frequency:term) pairs in the form of (freq0:term0,
   *         freq1:term1, ..., freqN:termN). Each pair is a single string
   *         separated by a ':' delimiter.
   */
  public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
    if (analyzer == null)
      throw new IllegalArgumentException("analyzer must not be null");
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    if (limit <= 0) limit = Integer.MAX_VALUE;

    // compute frequencies of distinct terms
    HashMap map = new HashMap();
    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
    try {
      Token token;
      while ((token = stream.next()) != null) {
        MutableInteger freq = (MutableInteger) map.get(token.termText());
        if (freq == null) {
          freq = new MutableInteger(1);
          map.put(token.termText(), freq);
        } else {
          freq.setValue(freq.intValue() + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        stream.close();
      } catch (IOException e2) {
        throw new RuntimeException(e2);
      }
    }

    // sort by frequency, then by term text
    Map.Entry[] entries = new Map.Entry[map.size()];
    map.entrySet().toArray(entries);
    Arrays.sort(entries, new Comparator() {
      public int compare(Object o1, Object o2) {
        Map.Entry e1 = (Map.Entry) o1;
        Map.Entry e2 = (Map.Entry) o2;
        int f1 = ((MutableInteger) e1.getValue()).intValue();
        int f2 = ((MutableInteger) e2.getValue()).intValue();
        if (f2 - f1 != 0) return f2 - f1;
        String s1 = (String) e1.getKey();
        String s2 = (String) e2.getKey();
        return s1.compareTo(s2);
      }
    });

    // return top N entries
    int size = Math.min(limit, entries.length);
    String[] pairs = new String[size];
    for (int i = 0; i < size; i++) {
      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
    }
    return pairs;
  }

  private static final class MutableInteger {
    private int value;

    public MutableInteger(int value) { this.value = value; }

    public int intValue() { return value; }

    public void setValue(int value) { this.value = value; }

    public String toString() { return String.valueOf(value); }
  }

  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
  /** (Line terminator followed by zero or more whitespace) two or more times */
  private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");

  /**
   * Returns at most the first N paragraphs of the given text. Delimiting
   * characters are excluded from the results. Each returned paragraph is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   *
   * @param text
   *            the text to tokenize into paragraphs
   * @param limit
   *            the maximum number of paragraphs to return; zero indicates
   *            "as many as possible".
   * @return the first N paragraphs
   */
  public static String[] getParagraphs(String text, int limit) {
    return tokenize(PARAGRAPHS, text, limit);
  }

  private static String[] tokenize(Pattern pattern, String text, int limit) {
    String[] tokens = pattern.split(text, limit);
    for (int i = tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
    return tokens;
  }

  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
  /** Divides text into sentences; includes the inverted Spanish exclamation and question marks */
  private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");

  /**
   * Returns at most the first N sentences of the given text. Delimiting
   * characters are excluded from the results. Each returned sentence is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   *
   * @param text
   *            the text to tokenize into sentences
   * @param limit
   *            the maximum number of sentences to return; zero indicates
   *            "as many as possible".
   * @return the first N sentences
   */
  public static String[] getSentences(String text, int limit) {
//    return tokenize(SENTENCES, text, limit); // equivalent but slower
    int len = text.length();
    if (len == 0) return new String[] { text };
    if (limit <= 0) limit = Integer.MAX_VALUE;

    // average sentence length heuristic
    String[] tokens = new String[Math.min(limit, 1 + len/40)];
    int size = 0;
    int i = 0;
    while (i < len && size < limit) {
      // scan to end of current sentence
      int start = i;
      while (i < len && !isSentenceSeparator(text.charAt(i))) i++;

      if (size == tokens.length) { // grow array
        String[] tmp = new String[tokens.length << 1];
        System.arraycopy(tokens, 0, tmp, 0, size);
        tokens = tmp;
      }
      // add sentence (potentially empty)
      tokens[size++] = text.substring(start, i).trim();

      // scan to beginning of next sentence
      while (i < len && isSentenceSeparator(text.charAt(i))) i++;
    }

    if (size == tokens.length) return tokens;
    String[] results = new String[size];
    System.arraycopy(tokens, 0, results, 0, size);
    return results;
  }

  private static boolean isSentenceSeparator(char c) {
    // regex [!\\.\\?\\xA1\\xBF]
    switch (c) {
      case '!': return true;
      case '.': return true;
      case '?': return true;
      case 0xA1: return true; // Spanish inverted exclamation mark
      case 0xBF: return true; // Spanish inverted question mark
      default: return false;
    }
  }

}
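
A minimal usage sketch for getTokenCachingAnalyzer, assuming the pre-2.9 Lucene token API used in the listing above and org.apache.lucene.analysis.SimpleAnalyzer as the child analyzer; the demo class name and sample text are illustrative only. The second pass over the same field name replays the cached tokens instead of re-running the child analyzer.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.AnalyzerUtil;

public class TokenCachingDemo {
  public static void main(String[] args) throws IOException {
    Analyzer cached = AnalyzerUtil.getTokenCachingAnalyzer(new SimpleAnalyzer());
    for (int pass = 1; pass <= 2; pass++) {
      // pass 1 runs SimpleAnalyzer and fills the per-field cache; pass 2 replays the cached tokens
      TokenStream stream = cached.tokenStream("content",
          new StringReader("to be or not to be that is the question"));
      Token token;
      while ((token = stream.next()) != null) {
        System.out.println("pass " + pass + ": " + token.termText());
      }
    }
  }
}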
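
The XQuery example in the getMostFrequentTerms javadoc has a plain-Java counterpart. A sketch under the same API assumptions, with SimpleAnalyzer standing in for PatternAnalyzer's EXTENDED_ANALYZER; the demo class name and sample text are illustrative only.

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.memory.AnalyzerUtil;

public class FrequentTermsDemo {
  public static void main(String[] args) {
    String text = "the quick brown fox jumps over the lazy dog and the dog sleeps";
    // top 3 (frequency:term) pairs, most frequent first, ties broken by term
    String[] pairs = AnalyzerUtil.getMostFrequentTerms(new SimpleAnalyzer(), text, 3);
    for (int i = 0; i < pairs.length; i++) {
      int colon = pairs[i].indexOf(':');
      String freq = pairs[i].substring(0, colon);
      String term = pairs[i].substring(colon + 1);
      System.out.println(term + " occurs " + freq + " times");
    }
  }
}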
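
getParagraphs and getSentences take no analyzer at all. A small sketch of both splitters; the demo class name and sample text are illustrative only.

import org.apache.lucene.index.memory.AnalyzerUtil;

public class SplitDemo {
  public static void main(String[] args) {
    String text = "First sentence. Second sentence! Third?\n\nA new paragraph.";
    String[] paragraphs = AnalyzerUtil.getParagraphs(text, 0); // 0 = as many as possible
    String[] sentences  = AnalyzerUtil.getSentences(text, 2);  // at most the first two
    System.out.println(paragraphs.length + " paragraphs; first = \"" + paragraphs[0] + "\"");
    for (int i = 0; i < sentences.length; i++) {
      System.out.println("sentence " + i + ": " + sentences[i]);
    }
  }
}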