📄 memoryindex.java
字号:
* @see Field#setBoost(float) */ public void addField(String fieldName, TokenStream stream, float boost) { /* * Note that this method signature avoids having a user call new * o.a.l.d.Field(...) which would be much too expensive due to the * String.intern() usage of that class. * * More often than not, String.intern() leads to serious performance * degradations rather than improvements! If you're curious why, check * out the JDK's native code, see how it oscillates multiple times back * and forth between Java code and native code on each intern() call, * only to end up using a plain vanilla java.util.HashMap on the Java * heap for it's interned strings! String.equals() has a small cost * compared to String.intern(), trust me. Application level interning * (e.g. a HashMap per Directory/Index) typically leads to better * solutions than frequent hidden low-level calls to String.intern(). * * Perhaps with some luck, Lucene's Field.java (and Term.java) and * cousins could be fixed to not use String.intern(). Sigh :-( */ try { if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null"); if (stream == null) throw new IllegalArgumentException("token stream must not be null"); if (boost <= 0.0f) throw new IllegalArgumentException("boost factor must be greater than 0.0"); if (fields.get(fieldName) != null) throw new IllegalArgumentException("field must not be added more than once"); HashMap terms = new HashMap(); int numTokens = 0; int pos = -1; Token token; while ((token = stream.next()) != null) { String term = token.termText(); if (term.length() == 0) continue; // nothing to do// if (DEBUG) System.err.println("token='" + term + "'"); numTokens++; pos += token.getPositionIncrement(); ArrayIntList positions = (ArrayIntList) terms.get(term); if (positions == null) { // term not seen before positions = new ArrayIntList(stride); terms.put(term, positions); } if (stride == 1) { positions.add(pos); } else { positions.add(pos, token.startOffset(), token.endOffset()); } } // ensure infos.numTokens > 0 invariant; needed for correct operation of terms() if (numTokens > 0) { boost = boost * docBoost; // see DocumentWriter.addDocument(...) fields.put(fieldName, new Info(terms, numTokens, boost)); sortedFields = null; // invalidate sorted view, if any } } catch (IOException e) { // can never happen throw new RuntimeException(e); } finally { try { if (stream != null) stream.close(); } catch (IOException e2) { throw new RuntimeException(e2); } } } /** * Creates and returns a searcher that can be used to execute arbitrary * Lucene queries and to collect the resulting query results as hits. * * @return a searcher */ public IndexSearcher createSearcher() { MemoryIndexReader reader = new MemoryIndexReader(); IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !! reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity() return searcher; } /** * Convenience method that efficiently returns the relevance score by * matching this index against the given Lucene query expression. * * @param query * an arbitrary Lucene query to run against this index * @return the relevance score of the matchmaking; A number in the range * [0.0 .. 1.0], with 0.0 indicating no match. The higher the number * the better the match. * @see org.apache.lucene.queryParser.QueryParser#parse(String) */ public float search(Query query) { if (query == null) throw new IllegalArgumentException("query must not be null"); Searcher searcher = createSearcher(); try { final float[] scores = new float[1]; // inits to 0.0f (no match) searcher.search(query, new HitCollector() { public void collect(int doc, float score) { scores[0] = score; } }); float score = scores[0]; return score; } catch (IOException e) { // can never happen (RAMDirectory) throw new RuntimeException(e); } finally { // searcher.close(); /* * Note that it is harmless and important for good performance to * NOT close the index reader!!! This avoids all sorts of * unnecessary baggage and locking in the Lucene IndexReader * superclass, all of which is completely unnecessary for this main * memory index data structure without thread-safety claims. * * Wishing IndexReader would be an interface... * * Actually with the new tight createSearcher() API auto-closing is now * made impossible, hence searcher.close() would be harmless and also * would not degrade performance... */ } } /** * Returns a reasonable approximation of the main memory [bytes] consumed by * this instance. Useful for smart memory sensititive caches/pools. Assumes * fieldNames are interned, whereas tokenized terms are memory-overlaid. * * @return the main memory consumption */ public int getMemorySize() { // for example usage in a smart cache see nux.xom.pool.Pool int PTR = VM.PTR; int INT = VM.INT; int size = 0; size += VM.sizeOfObject(2*PTR + INT); // memory index if (sortedFields != null) size += VM.sizeOfObjectArray(sortedFields.length); size += VM.sizeOfHashMap(fields.size()); Iterator iter = fields.entrySet().iterator(); while (iter.hasNext()) { // for each Field Info Map.Entry entry = (Map.Entry) iter.next(); Info info = (Info) entry.getValue(); size += VM.sizeOfObject(2*INT + 3*PTR); // Info instance vars if (info.sortedTerms != null) size += VM.sizeOfObjectArray(info.sortedTerms.length); int len = info.terms.size(); size += VM.sizeOfHashMap(len); Iterator iter2 = info.terms.entrySet().iterator(); while (--len >= 0) { // for each term Map.Entry e = (Map.Entry) iter2.next(); size += VM.sizeOfObject(PTR + 3*INT); // assumes substring() memory overlay// size += STR + 2 * ((String) e.getKey()).length(); ArrayIntList positions = (ArrayIntList) e.getValue(); size += VM.sizeOfArrayIntList(positions.size()); } } return size; } private int numPositions(ArrayIntList positions) { return positions.size() / stride; } /** sorts into ascending order (on demand), reusing memory along the way */ private void sortFields() { if (sortedFields == null) sortedFields = sort(fields); } /** returns a view of the given map's entries, sorted ascending by key */ private static Map.Entry[] sort(HashMap map) { int size = map.size(); Map.Entry[] entries = new Map.Entry[size]; Iterator iter = map.entrySet().iterator(); for (int i=0; i < size; i++) { entries[i] = (Map.Entry) iter.next(); } if (size > 1) Arrays.sort(entries, termComparator); return entries; } /** * Returns a String representation of the index data for debugging purposes. * * @return the string representation */ public String toString() { StringBuffer result = new StringBuffer(256); sortFields(); int sumChars = 0; int sumPositions = 0; int sumTerms = 0; for (int i=0; i < sortedFields.length; i++) { Map.Entry entry = sortedFields[i]; String fieldName = (String) entry.getKey(); Info info = (Info) entry.getValue(); info.sortTerms(); result.append(fieldName + ":\n"); int numChars = 0; int numPositions = 0; for (int j=0; j < info.sortedTerms.length; j++) { Map.Entry e = info.sortedTerms[j]; String term = (String) e.getKey(); ArrayIntList positions = (ArrayIntList) e.getValue(); result.append("\t'" + term + "':" + numPositions(positions) + ":"); result.append(positions.toString(stride)); // ignore offsets result.append("\n"); numPositions += numPositions(positions); numChars += term.length(); } result.append("\tterms=" + info.sortedTerms.length); result.append(", positions=" + numPositions); result.append(", Kchars=" + (numChars/1000.0f)); result.append("\n"); sumPositions += numPositions; sumChars += numChars; sumTerms += info.sortedTerms.length; } result.append("\nfields=" + sortedFields.length); result.append(", terms=" + sumTerms); result.append(", positions=" + sumPositions); result.append(", Kchars=" + (sumChars/1000.0f)); return result.toString(); } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// /** * Index data structure for a field; Contains the tokenized term texts and * their positions. */ private static final class Info implements Serializable { /** * Term strings and their positions for this field: Map <String * termText, ArrayIntList positions> */ private final HashMap terms; /** Terms sorted ascending by term text; computed on demand */ private transient Map.Entry[] sortedTerms; /** Number of added tokens for this field */ private final int numTokens; /** Boost factor for hits for this field */ private final float boost; /** Term for this field's fieldName, lazily computed on demand */ public transient Term template; private static final long serialVersionUID = 2882195016849084649L; public Info(HashMap terms, int numTokens, float boost) { this.terms = terms; this.numTokens = numTokens; this.boost = boost; } /** * Sorts hashed terms into ascending order, reusing memory along the * way. Note that sorting is lazily delayed until required (often it's * not required at all). If a sorted view is required then hashing + * sort + binary search is still faster and smaller than TreeMap usage * (which would be an alternative and somewhat more elegant approach, * apart from more sophisticated Tries / prefix trees). */ public void sortTerms() { if (sortedTerms == null) sortedTerms = sort(terms); } /** note that the frequency can be calculated as numPosition(getPositions(x)) */ public ArrayIntList getPositions(String term) { return (ArrayIntList) terms.get(term); } /** note that the frequency can be calculated as numPosition(getPositions(x)) */ public ArrayIntList getPositions(int pos) { return (ArrayIntList) sortedTerms[pos].getValue(); } public float getBoost() { return boost; } } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// /** * Efficient resizable auto-expanding list holding <code>int</code> elements; * implemented with arrays. */ private static final class ArrayIntList implements Serializable { private int[] elements; private int size = 0; private static final long serialVersionUID = 2282195016849084649L;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -