📄 memoryindex.java
字号:
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldSelector;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.index.TermDocs;import org.apache.lucene.index.TermEnum;import org.apache.lucene.index.TermFreqVector;import org.apache.lucene.index.TermPositionVector;import org.apache.lucene.index.TermPositions;import org.apache.lucene.search.HitCollector;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.Searcher;import org.apache.lucene.search.Similarity;import java.io.IOException;import java.io.Serializable;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Comparator;import java.util.HashMap;import java.util.Iterator;import java.util.Map;/** * High-performance single-document main memory Apache Lucene fulltext search index. 
* * <h4>Overview</h4> * * This class is a replacement/substitute for a large subset of * {@link org.apache.lucene.store.RAMDirectory} functionality. It is designed to * enable maximum efficiency for on-the-fly matchmaking combining structured and * fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML * message queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and * distribution systems, application level routers, firewalls, classifiers, etc. * Rather than targetting fulltext search of infrequent queries over huge persistent * data archives (historic search), this class targets fulltext search of huge * numbers of queries over comparatively small transient realtime data (prospective * search). * For example as in * <pre> * float score = search(String text, Query query) * </pre> * <p> * Each instance can hold at most one Lucene "document", with a document containing * zero or more "fields", each field having a name and a fulltext value. The * fulltext value is tokenized (split and transformed) into zero or more index terms * (aka words) on <code>addField()</code>, according to the policy implemented by an * Analyzer. For example, Lucene analyzers can split on whitespace, normalize to lower case * for case insensitivity, ignore common terms with little discriminatory value such as "he", "in", "and" (stop * words), reduce the terms to their natural linguistic root form such as "fishing" * being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri * (upon indexing and/or querying), etc. For details, see * <a target="_blank" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Analyzer Intro</a>. 
* <p> * Arbitrary Lucene queries can be run against this class - see <a target="_blank" * href="http://lucene.apache.org/java/docs/queryparsersyntax.html">Lucene Query Syntax</a> * as well as <a target="_blank" * href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">Query Parser Rules</a>. * Note that a Lucene query selects on the field names and associated (indexed) * tokenized terms, not on the original fulltext(s) - the latter are not stored * but rather thrown away immediately after tokenization. * <p> * For some interesting background information on search technology, see Bob Wyman's * <a target="_blank" * href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>, * Jim Gray's * <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&pa=showpage&pid=293&page=4"> * A Call to Arms - Custom subscriptions</a>, and Tim Bray's * <a target="_blank" * href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>. 
* * * <h4>Example Usage</h4> * * <pre> * Analyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER; * //Analyzer analyzer = new SimpleAnalyzer(); * MemoryIndex index = new MemoryIndex(); * index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer); * index.addField("author", "Tales of James", analyzer); * QueryParser parser = new QueryParser("content", analyzer); * float score = index.search(parser.parse("+author:james +salmon~ +fish* manual~")); * if (score > 0.0f) { * System.out.println("it's a match"); * } else { * System.out.println("no match found"); * } * System.out.println("indexData=" + index.toString()); * </pre> * * * <h4>Example XQuery Usage</h4> * * <pre> * (: An XQuery that finds all books authored by James that have something to do with "salmon fishing manuals", sorted by relevance :) * declare namespace lucene = "java:nux.xom.pool.FullTextUtil"; * declare variable $query := "+salmon~ +fish* manual~"; (: any arbitrary Lucene query can go here :) * * for $book in /books/book[author="James" and lucene:match(abstract, $query) > 0.0] * let $score := lucene:match($book/abstract, $query) * order by $score descending * return $book * </pre> * * * <h4>No thread safety guarantees</h4> * * An instance can be queried multiple times with the same or different queries, * but an instance is not thread-safe. If desired use idioms such as: * <pre> * MemoryIndex index = ... * synchronized (index) { * // read and/or write index (i.e. add fields and/or query) * } * </pre> * * * <h4>Performance Notes</h4> * * Internally there's a new data structure geared towards efficient indexing * and searching, plus the necessary support code to seamlessly plug into the Lucene * framework. * <p> * This class performs very well for very small texts (e.g. 10 chars) * as well as for large texts (e.g. 10 MB) and everything in between. * Typically, it is about 10-100 times faster than <code>RAMDirectory</code>. 
* Note that <code>RAMDirectory</code> has particularly
 * large efficiency overheads for small to medium sized texts, both in time and space.
 * Indexing a field with N tokens takes O(N) in the best case, and O(N logN) in the worst
 * case. Memory consumption is probably larger than for <code>RAMDirectory</code>.
 * <p>
 * Example throughput of many simple term queries over a single MemoryIndex:
 * ~500000 queries/sec on a MacBook Pro, jdk 1.5.0_06, server VM.
 * As always, your mileage may vary.
 * <p>
 * If you're curious about the whereabouts of bottlenecks, run java 1.5 with the
 * non-perturbing '-server -agentlib:hprof=cpu=samples,depth=10' flags, then study
 * the trace log and correlate its hotspot trailer with its call stack headers (see
 * <a target="_blank"
 * href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
 * hprof tracing </a>).
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class MemoryIndex {

  /** info for each field: Map<String fieldName, Info field> */
  private final HashMap fields = new HashMap();

  /** fields sorted ascending by fieldName; lazily computed on demand */
  private transient Map.Entry[] sortedFields;

  /**
   * Number of ints stored per term occurrence:
   * pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2].
   * 1 when character offsets are not recorded, 3 when they are (see the private
   * boolean constructor).
   */
  private final int stride;

  /** Could be made configurable; See {@link Document#setBoost(float)} */
  private static final float docBoost = 1.0f;

  private static final long serialVersionUID = 2782195016849084649L;

  private static final boolean DEBUG = false;

  /**
   * Sorts term entries into ascending order; also works for
   * Arrays.binarySearch() and Arrays.sort().
   * Accepts either plain String terms or Map.Entry objects keyed by String,
   * so the same comparator serves both term maps and sorted entry arrays.
   */
  private static final Comparator termComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
      // unwrap Map.Entry arguments down to their String keys
      if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey();
      if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey();
      if (o1 == o2) return 0; // fast path for identical references
      return ((String) o1).compareTo((String) o2);
    }
  };

  /**
   * Constructs an empty instance.
*/ public MemoryIndex() { this(false); } /** * Constructs an empty instance that can optionally store the start and end * character offset of each token term in the text. This can be useful for * highlighting of hit locations with the Lucene highlighter package. * Private until the highlighter package matures, so that this can actually * be meaningfully integrated. * * @param storeOffsets * whether or not to store the start and end character offset of * each token term in the text */ private MemoryIndex(boolean storeOffsets) { this.stride = storeOffsets ? 3 : 1; } /** * Convenience method; Tokenizes the given field text and adds the resulting * terms to the index; Equivalent to adding an indexed non-keyword Lucene * {@link org.apache.lucene.document.Field} that is * {@link org.apache.lucene.document.Field.Index#TOKENIZED tokenized}, * {@link org.apache.lucene.document.Field.Store#NO not stored}, * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS termVectorStored with positions} (or * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS termVectorStored with positions and offsets}), * * @param fieldName * a name to be associated with the text * @param text * the text to tokenize and index. 
 * @param analyzer
 *            the analyzer to use for tokenization
 */
public void addField(String fieldName, String text, Analyzer analyzer) {
  if (fieldName == null)
    throw new IllegalArgumentException("fieldName must not be null");
  if (text == null)
    throw new IllegalArgumentException("text must not be null");
  if (analyzer == null)
    throw new IllegalArgumentException("analyzer must not be null");

  TokenStream stream;
  if (analyzer instanceof PatternAnalyzer) {
    // fast path: PatternAnalyzer can tokenize a String directly, avoiding
    // the construction of a Reader
    stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text);
  } else {
    stream = analyzer.tokenStream(fieldName,
        new PatternAnalyzer.FastStringReader(text));
  }
  addField(fieldName, stream);
}

/**
 * Convenience method; Creates and returns a token stream that generates a
 * token for each keyword in the given collection, "as is", without any
 * transforming text analysis. The resulting token stream can be fed into
 * {@link #addField(String, TokenStream)}, perhaps wrapped into another
 * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
 *
 * @param keywords
 *            the keywords to generate tokens for
 * @return the corresponding token stream
 */
public TokenStream keywordTokenStream(final Collection keywords) {
  // TODO: deprecate & move this method into AnalyzerUtil?
  if (keywords == null)
    throw new IllegalArgumentException("keywords must not be null");

  return new TokenStream() {
    private Iterator iter = keywords.iterator();
    // character offset of the next synthetic token; tokens are laid out as if
    // the keywords were separated by a single blank in one large string
    private int start = 0;

    public Token next() {
      if (!iter.hasNext()) return null;

      Object obj = iter.next();
      if (obj == null)
        throw new IllegalArgumentException("keyword must not be null");

      String term = obj.toString();
      Token token = new Token(term, start, start + term.length());
      start += term.length() + 1; // separate words by 1 (blank) character
      return token;
    }
  };
}

/**
 * Equivalent to <code>addField(fieldName, stream, 1.0f)</code>.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from
 */
public void addField(String fieldName, TokenStream stream) {
  addField(fieldName, stream, 1.0f);
}

/**
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added
 * with this method via {@link #keywordTokenStream(Collection)}, the Lucene contrib
 * <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from.
 * @param boost
 *            the boost factor for hits for this field
 * @see Field#setBoost(float)
 */
public void addField(String fieldName, TokenStream stream, float boost) {
  /*
   * Note that this method signature avoids having a user call new
   * o.a.l.d.Field(...) which would be much too expensive due to the
   * String.intern() usage of that class.
   *
   * More often than not, String.intern() leads to serious performance
   * degradations rather than improvements! If you're curious why, check
   * out the JDK's native code, see how it oscillates multiple times back
   * and forth between Java code and native code on each intern() call,
   * only to end up using a plain vanilla java.util.HashMap on the Java
   * heap for it's interned strings! String.equals() has a small cost
   * compared to String.intern(), trust me. Application level interning
   * (e.g. a HashMap per Directory/Index) typically leads to better
   * solutions than frequent hidden low-level calls to String.intern().
   *
   * Perhaps with some luck, Lucene's Field.java (and Term.java) and
   * cousins could be fixed to not use String.intern(). Sigh :-(
   */
  try {
    if (fieldName == null)
      throw new IllegalArgumentException("fieldName must not be null");
    if (stream == null)
      throw new IllegalArgumentException("token stream must not be null");
    if (boost <= 0.0f)
      throw new IllegalArgumentException("boost factor must be greater than 0.0");
    if (fields.get(fieldName) != null)
      throw new IllegalArgumentException("field must not be added more than once");

    // per-field Map<String term, ArrayIntList positions>
    HashMap terms = new HashMap();
    int numTokens = 0;
    int pos = -1; // start at -1 so the first position increment yields 0
    Token token;

    while ((token = stream.next()) != null) {
      String term = token.termText();
      if (term.length() == 0) continue; // nothing to do
//    if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      pos += token.getPositionIncrement();

      ArrayIntList positions = (ArrayIntList) terms.get(term);
      if (positions == null) { // term not seen before
        positions = new ArrayIntList(stride);
        terms.put(term, positions);
      }
      if (stride == 1) {
        positions.add(pos); // positions only
      } else {
        // stride == 3: position plus start/end character offsets
        positions.add(pos, token.startOffset(), token.endOffset());
      }
    }

    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      boost = boost * docBoost; // see DocumentWriter.addDocument(...)
      fields.put(fieldName, new Info(terms, numTokens, boost));
      sortedFields = null; // invalidate sorted view, if any
    }
  } catch (IOException e) { // can never happen
    throw new RuntimeException(e);
  } finally {
    try {
      // NOTE(review): rethrowing from finally can mask an exception raised in
      // the try block above — consider logging close() failures instead; verify
      // before changing, callers may rely on the current behavior.
      if (stream != null) stream.close();
    } catch (IOException e2) {
      throw new RuntimeException(e2);
    }
  }
}

/**
 * Creates and returns a searcher that can be used to execute arbitrary
 * Lucene queries and to collect the resulting query results as hits.
 *
 * @return a searcher
 */
public IndexSearcher createSearcher() {
  MemoryIndexReader reader = new MemoryIndexReader();
  IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
  reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()
  return searcher;
}

/**
 * Convenience method that efficiently returns the relevance score by
 * matching this index against the given Lucene query expression.
 *
 * @param query
 *            an arbitrary Lucene query to run against this index
 * @return the relevance score of the matchmaking; A number in the range
 *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -