📄 memoryindex.java

📁 Lucene a java open-source SearchEngine Framework
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
   * @see Field#setBoost(float)   */  public void addField(String fieldName, TokenStream stream, float boost) {    /*     * Note that this method signature avoids having a user call new     * o.a.l.d.Field(...) which would be much too expensive due to the     * String.intern() usage of that class.     *      * More often than not, String.intern() leads to serious performance     * degradations rather than improvements! If you're curious why, check     * out the JDK's native code, see how it oscillates multiple times back     * and forth between Java code and native code on each intern() call,     * only to end up using a plain vanilla java.util.HashMap on the Java     * heap for it's interned strings! String.equals() has a small cost     * compared to String.intern(), trust me. Application level interning     * (e.g. a HashMap per Directory/Index) typically leads to better     * solutions than frequent hidden low-level calls to String.intern().     *      * Perhaps with some luck, Lucene's Field.java (and Term.java) and     * cousins could be fixed to not use String.intern(). 
Sigh :-(     */    try {      if (fieldName == null)        throw new IllegalArgumentException("fieldName must not be null");      if (stream == null)          throw new IllegalArgumentException("token stream must not be null");      if (boost <= 0.0f)          throw new IllegalArgumentException("boost factor must be greater than 0.0");      if (fields.get(fieldName) != null)        throw new IllegalArgumentException("field must not be added more than once");            HashMap terms = new HashMap();      int numTokens = 0;      int pos = -1;      Token token;            while ((token = stream.next()) != null) {        String term = token.termText();        if (term.length() == 0) continue; // nothing to do//        if (DEBUG) System.err.println("token='" + term + "'");        numTokens++;        pos += token.getPositionIncrement();                ArrayIntList positions = (ArrayIntList) terms.get(term);        if (positions == null) { // term not seen before          positions = new ArrayIntList(stride);          terms.put(term, positions);        }        if (stride == 1) {          positions.add(pos);        } else {          positions.add(pos, token.startOffset(), token.endOffset());        }      }            // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()      if (numTokens > 0) {        boost = boost * docBoost; // see DocumentWriter.addDocument(...)        fields.put(fieldName, new Info(terms, numTokens, boost));        sortedFields = null;    // invalidate sorted view, if any      }    } catch (IOException e) { // can never happen      throw new RuntimeException(e);    } finally {      try {        if (stream != null) stream.close();      } catch (IOException e2) {        throw new RuntimeException(e2);      }    }  }    /**   * Creates and returns a searcher that can be used to execute arbitrary   * Lucene queries and to collect the resulting query results as hits.   
*    * @return a searcher   */  public IndexSearcher createSearcher() {    MemoryIndexReader reader = new MemoryIndexReader();    IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !!    reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity()    return searcher;  }    /**   * Convenience method that efficiently returns the relevance score by   * matching this index against the given Lucene query expression.   *    * @param query   *            an arbitrary Lucene query to run against this index   * @return the relevance score of the matchmaking; A number in the range   *         [0.0 .. 1.0], with 0.0 indicating no match. The higher the number   *         the better the match.   * @see org.apache.lucene.queryParser.QueryParser#parse(String)   */  public float search(Query query) {    if (query == null)       throw new IllegalArgumentException("query must not be null");        Searcher searcher = createSearcher();    try {      final float[] scores = new float[1]; // inits to 0.0f (no match)      searcher.search(query, new HitCollector() {        public void collect(int doc, float score) {          scores[0] = score;        }      });      float score = scores[0];      return score;    } catch (IOException e) { // can never happen (RAMDirectory)      throw new RuntimeException(e);    } finally {      // searcher.close();      /*       * Note that it is harmless and important for good performance to       * NOT close the index reader!!! This avoids all sorts of       * unnecessary baggage and locking in the Lucene IndexReader       * superclass, all of which is completely unnecessary for this main       * memory index data structure without thread-safety claims.       *        * Wishing IndexReader would be an interface...       
*        * Actually with the new tight createSearcher() API auto-closing is now       * made impossible, hence searcher.close() would be harmless and also        * would not degrade performance...       */    }     }    /**   * Returns a reasonable approximation of the main memory [bytes] consumed by   * this instance. Useful for smart memory sensititive caches/pools. Assumes   * fieldNames are interned, whereas tokenized terms are memory-overlaid.   *    * @return the main memory consumption   */  public int getMemorySize() {    // for example usage in a smart cache see nux.xom.pool.Pool        int PTR = VM.PTR;    int INT = VM.INT;    int size = 0;    size += VM.sizeOfObject(2*PTR + INT); // memory index    if (sortedFields != null) size += VM.sizeOfObjectArray(sortedFields.length);        size += VM.sizeOfHashMap(fields.size());    Iterator iter = fields.entrySet().iterator();    while (iter.hasNext()) { // for each Field Info      Map.Entry entry = (Map.Entry) iter.next();            Info info = (Info) entry.getValue();      size += VM.sizeOfObject(2*INT + 3*PTR); // Info instance vars      if (info.sortedTerms != null) size += VM.sizeOfObjectArray(info.sortedTerms.length);            int len = info.terms.size();      size += VM.sizeOfHashMap(len);      Iterator iter2 = info.terms.entrySet().iterator();      while (--len >= 0) { // for each term        Map.Entry e = (Map.Entry) iter2.next();        size += VM.sizeOfObject(PTR + 3*INT); // assumes substring() memory overlay//        size += STR + 2 * ((String) e.getKey()).length();        ArrayIntList positions = (ArrayIntList) e.getValue();        size += VM.sizeOfArrayIntList(positions.size());      }    }    return size;  }   private int numPositions(ArrayIntList positions) {    return positions.size() / stride;  }    /** sorts into ascending order (on demand), reusing memory along the way */  private void sortFields() {    if (sortedFields == null) sortedFields = sort(fields);  }    /** returns a view of the 
given map's entries, sorted ascending by key */
  private static Map.Entry[] sort(HashMap map) {
    // Copy the entries into an exactly-sized array, then order them in place.
    Map.Entry[] sorted = (Map.Entry[]) map.entrySet().toArray(new Map.Entry[map.size()]);
    if (sorted.length > 1) {
      Arrays.sort(sorted, termComparator);
    }
    return sorted;
  }

  /**
   * Returns a String representation of the index data for debugging purposes.
   * Fields and their terms are sorted on demand first; each term is listed
   * with its position count and positions, followed by per-field totals and
   * overall totals.
   *
   * @return the string representation
   */
  public String toString() {
    StringBuffer out = new StringBuffer(256);

    sortFields();
    int totalChars = 0;
    int totalPositions = 0;
    int totalTerms = 0;

    for (int f = 0; f < sortedFields.length; f++) {
      Map.Entry fieldEntry = sortedFields[f];
      String fieldName = (String) fieldEntry.getKey();
      Info info = (Info) fieldEntry.getValue();
      info.sortTerms();
      out.append(fieldName).append(":\n");

      Map.Entry[] sortedTerms = info.sortedTerms;
      int fieldChars = 0;
      int fieldPositions = 0;
      for (int t = 0; t < sortedTerms.length; t++) {
        String term = (String) sortedTerms[t].getKey();
        ArrayIntList positions = (ArrayIntList) sortedTerms[t].getValue();
        int count = numPositions(positions);

        out.append("\t'").append(term).append("':").append(count).append(":");
        out.append(positions.toString(stride)); // ignore offsets
        out.append("\n");

        fieldPositions += count;
        fieldChars += term.length();
      }

      // per-field summary line
      out.append("\tterms=").append(sortedTerms.length);
      out.append(", positions=").append(fieldPositions);
      out.append(", Kchars=").append(fieldChars / 1000.0f);
      out.append("\n");

      totalPositions += fieldPositions;
      totalChars += fieldChars;
      totalTerms += sortedTerms.length;
    }

    // overall summary line
    out.append("\nfields=").append(sortedFields.length);
    out.append(", terms=").append(totalTerms);
    out.append(", positions=").append(totalPositions);
    out.append(", Kchars=").append(totalChars / 1000.0f);
    return out.toString();
  }

///////////////////////////////////////////////////////////////////////////////  // Nested classes:  ///////////////////////////////////////////////////////////////////////////////  /**   * Index data structure for a field; Contains the tokenized term texts and   * their positions.   */  private static final class Info implements Serializable {        /**     * Term strings and their positions for this field: Map <String     * termText, ArrayIntList positions>     */    private final HashMap terms;         /** Terms sorted ascending by term text; computed on demand */    private transient Map.Entry[] sortedTerms;        /** Number of added tokens for this field */    private final int numTokens;        /** Boost factor for hits for this field */    private final float boost;    /** Term for this field's fieldName, lazily computed on demand */    public transient Term template;    private static final long serialVersionUID = 2882195016849084649L;      public Info(HashMap terms, int numTokens, float boost) {      this.terms = terms;      this.numTokens = numTokens;      this.boost = boost;    }        /**     * Sorts hashed terms into ascending order, reusing memory along the     * way. Note that sorting is lazily delayed until required (often it's     * not required at all). If a sorted view is required then hashing +     * sort + binary search is still faster and smaller than TreeMap usage     * (which would be an alternative and somewhat more elegant approach,     * apart from more sophisticated Tries / prefix trees).     
*/    public void sortTerms() {      if (sortedTerms == null) sortedTerms = sort(terms);    }            /** note that the frequency can be calculated as numPosition(getPositions(x)) */    public ArrayIntList getPositions(String term) {      return (ArrayIntList) terms.get(term);    }    /** note that the frequency can be calculated as numPosition(getPositions(x)) */    public ArrayIntList getPositions(int pos) {      return (ArrayIntList) sortedTerms[pos].getValue();    }        public float getBoost() {      return boost;    }      }      ///////////////////////////////////////////////////////////////////////////////  // Nested classes:  ///////////////////////////////////////////////////////////////////////////////  /**   * Efficient resizable auto-expanding list holding <code>int</code> elements;   * implemented with arrays.   */  private static final class ArrayIntList implements Serializable {    private int[] elements;    private int size = 0;        private static final long serialVersionUID = 2282195016849084649L;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -