📄 memoryindex.java
字号:
package org.apache.lucene.index.memory;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.Token;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.FieldSelector;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.Term;import org.apache.lucene.index.TermDocs;import org.apache.lucene.index.TermEnum;import org.apache.lucene.index.TermFreqVector;import org.apache.lucene.index.TermPositionVector;import org.apache.lucene.index.TermPositions;import org.apache.lucene.search.HitCollector;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.Searcher;import org.apache.lucene.search.Similarity;import java.io.IOException;import java.io.Serializable;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Comparator;import java.util.HashMap;import java.util.Iterator;import java.util.Map;/** * High-performance single-document main memory Apache Lucene fulltext search index. 
* * <h4>Overview</h4> * * This class is a replacement/substitute for a large subset of * {@link org.apache.lucene.store.RAMDirectory} functionality. It is designed to * enable maximum efficiency for on-the-fly matchmaking combining structured and * fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML * message queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and * distribution systems, application level routers, firewalls, classifiers, etc. * Rather than targetting fulltext search of infrequent queries over huge persistent * data archives (historic search), this class targets fulltext search of huge * numbers of queries over comparatively small transient realtime data (prospective * search). * For example as in * <pre> * float score = search(String text, Query query) * </pre> * <p> * Each instance can hold at most one Lucene "document", with a document containing * zero or more "fields", each field having a name and a fulltext value. The * fulltext value is tokenized (split and transformed) into zero or more index terms * (aka words) on <code>addField()</code>, according to the policy implemented by an * Analyzer. For example, Lucene analyzers can split on whitespace, normalize to lower case * for case insensitivity, ignore common terms with little discriminatory value such as "he", "in", "and" (stop * words), reduce the terms to their natural linguistic root form such as "fishing" * being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri * (upon indexing and/or querying), etc. For details, see * <a target="_blank" href="http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html">Lucene Analyzer Intro</a>. 
* <p> * Arbitrary Lucene queries can be run against this class - see <a target="_blank" * href="http://lucene.apache.org/java/docs/queryparsersyntax.html">Lucene Query Syntax</a> * as well as <a target="_blank" * href="http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html">Query Parser Rules</a>. * Note that a Lucene query selects on the field names and associated (indexed) * tokenized terms, not on the original fulltext(s) - the latter are not stored * but rather thrown away immediately after tokenization. * <p> * For some interesting background information on search technology, see Bob Wyman's * <a target="_blank" * href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>, * Jim Gray's * <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&pa=showpage&pid=293&page=4"> * A Call to Arms - Custom subscriptions</a>, and Tim Bray's * <a target="_blank" * href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>. 
* * * <h4>Example Usage</h4> * * <pre> * Analyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER; * //Analyzer analyzer = new SimpleAnalyzer(); * MemoryIndex index = new MemoryIndex(); * index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", analyzer); * index.addField("author", "Tales of James", analyzer); * QueryParser parser = new QueryParser("content", analyzer); * float score = index.search(parser.parse("+author:james +salmon~ +fish* manual~")); * if (score > 0.0f) { * System.out.println("it's a match"); * } else { * System.out.println("no match found"); * } * System.out.println("indexData=" + index.toString()); * </pre> * * * <h4>Example XQuery Usage</h4> * * <pre> * (: An XQuery that finds all books authored by James that have something to do with "salmon fishing manuals", sorted by relevance :) * declare namespace lucene = "java:nux.xom.pool.FullTextUtil"; * declare variable $query := "+salmon~ +fish* manual~"; (: any arbitrary Lucene query can go here :) * * for $book in /books/book[author="James" and lucene:match(abstract, $query) > 0.0] * let $score := lucene:match($book/abstract, $query) * order by $score descending * return $book * </pre> * * * <h4>No thread safety guarantees</h4> * * An instance can be queried multiple times with the same or different queries, * but an instance is not thread-safe. If desired use idioms such as: * <pre> * MemoryIndex index = ... * synchronized (index) { * // read and/or write index (i.e. add fields and/or query) * } * </pre> * * * <h4>Performance Notes</h4> * * Internally there's a new data structure geared towards efficient indexing * and searching, plus the necessary support code to seamlessly plug into the Lucene * framework. * <p> * This class performs very well for very small texts (e.g. 10 chars) * as well as for large texts (e.g. 10 MB) and everything in between. * Typically, it is about 10-100 times faster than <code>RAMDirectory</code>. 
* Note that <code>RAMDirectory</code> has particularly
 * large efficiency overheads for small to medium sized texts, both in time and space.
 * Indexing a field with N tokens takes O(N) in the best case, and O(N logN) in the worst
 * case. Memory consumption is probably larger than for <code>RAMDirectory</code>.
 * <p>
 * Example throughput of many simple term queries over a single MemoryIndex:
 * ~500000 queries/sec on a MacBook Pro, jdk 1.5.0_06, server VM.
 * As always, your mileage may vary.
 * <p>
 * If you're curious about the whereabouts of bottlenecks, run java 1.5 with the
 * non-perturbing '-server -agentlib:hprof=cpu=samples,depth=10' flags, then study
 * the trace log and correlate its hotspot trailer with its call stack headers (see
 * <a target="_blank"
 * href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
 * hprof tracing </a>).
 *
 * @author whoschek.AT.lbl.DOT.gov
 */
public class MemoryIndex {

  /** info for each field: Map<String fieldName, Info field> */
  private final HashMap fields = new HashMap();

  /** fields sorted ascending by fieldName; lazily computed on demand */
  private transient Map.Entry[] sortedFields;

  /**
   * Number of ints stored per term occurrence:
   * pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2].
   * 1 when character offsets are not recorded, 3 when they are (see the private
   * boolean constructor).
   */
  private final int stride;

  /** Could be made configurable; See {@link Document#setBoost(float)} */
  private static final float docBoost = 1.0f;

  private static final long serialVersionUID = 2782195016849084649L;

  private static final boolean DEBUG = false;

  /**
   * Sorts term entries into ascending order; also works for
   * Arrays.binarySearch() and Arrays.sort().
   * Accepts either plain String terms or Map.Entry objects keyed by String,
   * so the same comparator serves both term maps and sorted entry arrays.
   */
  private static final Comparator termComparator = new Comparator() {
    public int compare(Object o1, Object o2) {
      // unwrap Map.Entry arguments down to their String keys
      if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey();
      if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey();
      if (o1 == o2) return 0; // fast path for identical references
      return ((String) o1).compareTo((String) o2);
    }
  };

  /**
   * Constructs an empty instance.
*/ public MemoryIndex() { this(false); } /** * Constructs an empty instance that can optionally store the start and end * character offset of each token term in the text. This can be useful for * highlighting of hit locations with the Lucene highlighter package. * Private until the highlighter package matures, so that this can actually * be meaningfully integrated. * * @param storeOffsets * whether or not to store the start and end character offset of * each token term in the text */ private MemoryIndex(boolean storeOffsets) { this.stride = storeOffsets ? 3 : 1; } /** * Convenience method; Tokenizes the given field text and adds the resulting * terms to the index; Equivalent to adding an indexed non-keyword Lucene * {@link org.apache.lucene.document.Field} that is * {@link org.apache.lucene.document.Field.Index#TOKENIZED tokenized}, * {@link org.apache.lucene.document.Field.Store#NO not stored}, * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS termVectorStored with positions} (or * {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS termVectorStored with positions and offsets}), * * @param fieldName * a name to be associated with the text * @param text * the text to tokenize and index. 
 * @param analyzer
 *            the analyzer to use for tokenization
 */
public void addField(String fieldName, String text, Analyzer analyzer) {
  if (fieldName == null)
    throw new IllegalArgumentException("fieldName must not be null");
  if (text == null)
    throw new IllegalArgumentException("text must not be null");
  if (analyzer == null)
    throw new IllegalArgumentException("analyzer must not be null");

  TokenStream stream;
  if (analyzer instanceof PatternAnalyzer) {
    // fast path: PatternAnalyzer can tokenize a String directly, avoiding
    // the construction of a Reader
    stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text);
  } else {
    stream = analyzer.tokenStream(fieldName,
        new PatternAnalyzer.FastStringReader(text));
  }
  addField(fieldName, stream);
}

/**
 * Convenience method; Creates and returns a token stream that generates a
 * token for each keyword in the given collection, "as is", without any
 * transforming text analysis. The resulting token stream can be fed into
 * {@link #addField(String, TokenStream)}, perhaps wrapped into another
 * {@link org.apache.lucene.analysis.TokenFilter}, as desired.
 *
 * @param keywords
 *            the keywords to generate tokens for
 * @return the corresponding token stream
 */
public TokenStream keywordTokenStream(final Collection keywords) {
  // TODO: deprecate & move this method into AnalyzerUtil?
  if (keywords == null)
    throw new IllegalArgumentException("keywords must not be null");

  return new TokenStream() {
    private Iterator iter = keywords.iterator();
    // character offset of the next synthetic token; tokens are laid out as if
    // the keywords were separated by a single blank in one large string
    private int start = 0;

    public Token next() {
      if (!iter.hasNext()) return null;

      Object obj = iter.next();
      if (obj == null)
        throw new IllegalArgumentException("keyword must not be null");

      String term = obj.toString();
      Token token = new Token(term, start, start + term.length());
      start += term.length() + 1; // separate words by 1 (blank) character
      return token;
    }
  };
}

/**
 * Equivalent to <code>addField(fieldName, stream, 1.0f)</code>.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from
 */
public void addField(String fieldName, TokenStream stream) {
  addField(fieldName, stream, 1.0f);
}

/**
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added
 * with this method via {@link #keywordTokenStream(Collection)}, the Lucene contrib
 * <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from.
 * @param boost
 *            the boost factor for hits for this field
 * @see Field#setBoost(float)
 */
public void addField(String fieldName, TokenStream stream, float boost) {
  /*
   * Note that this method signature avoids having a user call new
   * o.a.l.d.Field(...) which would be much too expensive due to the
   * String.intern() usage of that class.
   *
   * More often than not, String.intern() leads to serious performance
   * degradations rather than improvements! If you're curious why, check
   * out the JDK's native code, see how it oscillates multiple times back
   * and forth between Java code and native code on each intern() call,
   * only to end up using a plain vanilla java.util.HashMap on the Java
   * heap for it's interned strings! String.equals() has a small cost
   * compared to String.intern(), trust me. Application level interning
   * (e.g. a HashMap per Directory/Index) typically leads to better
   * solutions than frequent hidden low-level calls to String.intern().
   *
   * Perhaps with some luck, Lucene's Field.java (and Term.java) and
   * cousins could be fixed to not use String.intern(). Sigh :-(
   */
  try {
    if (fieldName == null)
      throw new IllegalArgumentException("fieldName must not be null");
    if (stream == null)
      throw new IllegalArgumentException("token stream must not be null");
    if (boost <= 0.0f)
      throw new IllegalArgumentException("boost factor must be greater than 0.0");
    if (fields.get(fieldName) != null)
      throw new IllegalArgumentException("field must not be added more than once");

    // per-field Map<String term, ArrayIntList positions>
    HashMap terms = new HashMap();
    int numTokens = 0;
    int pos = -1; // start at -1 so the first position increment yields 0
    Token token;

    while ((token = stream.next()) != null) {
      String term = token.termText();
      if (term.length() == 0) continue; // nothing to do
//    if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      pos += token.getPositionIncrement();

      ArrayIntList positions = (ArrayIntList) terms.get(term);
      if (positions == null) { // term not seen before
        positions = new ArrayIntList(stride);
        terms.put(term, positions);
      }
      if (stride == 1) {
        positions.add(pos); // positions only
      } else {
        // stride == 3: position plus start/end character offsets
        positions.add(pos, token.startOffset(), token.endOffset());
      }
    }

    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      boost = boost * docBoost; // see DocumentWriter.addDocument(...)
      fields.put(fieldName, new Info(terms, numTokens, boost));
      sortedFields = null; // invalidate sorted view, if any
    }
  } catch (IOException e) { // can never happen
    throw new RuntimeException(e);
  } finally {
    try {
      // NOTE(review): rethrowing from finally can mask an exception raised in
      // the try block above — consider logging close() failures instead; verify
      // before changing, callers may rely on the current behavior.
      if (stream != null) stream.close();
    } catch (IOException e2) {
      throw new RuntimeException(e2);
    }
  }
}

/**
 * Creates and returns a searcher that can be used to execute arbitrary
 * Lucene queries and to collect the resulting query results as hits.
 *
 * @return a searcher
 */
public IndexSearcher createSearcher() {
  MemoryIndexReader reader = new MemoryIndexReader();
  IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!
  reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()
  return searcher;
}

/**
 * Convenience method that efficiently returns the relevance score by
 * matching this index against the given Lucene query expression.
 *
 * @param query
 *            an arbitrary Lucene query to run against this index
 * @return the relevance score of the matchmaking; A number in the range
 *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -