📄 multitermindexiterator.java
字号:
package it.unimi.dsi.mg4j.index;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;import it.unimi.dsi.mg4j.search.DocumentIterator;import it.unimi.dsi.util.Interval;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.OrDocumentIterator;import it.unimi.dsi.mg4j.search.score.BM25Scorer;import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;import java.io.IOException;/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.** <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}* an interval interator generating the OR of the intervals returned for each of the documents involved.* The main difference with an {@link OrDocumentIterator} built on the same array of component iterators* is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum* of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The* latter is by default the maximum frequency of a component iterator, but it can be set * at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.* * <p>The main <i>raison d'être</i> of this class is support for query expansion: a blind application* of {@link OrDocumentIterator} to an array of index iterators would mislead scorers such as {@link BM25Scorer}* because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score. */public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator { @SuppressWarnings("unused") private static final boolean ASSERTS = false; /** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache * frequency after the first call to {@link #frequency()}. */ private int frequency; /** The term of this iterator. */ protected String term; /** The id of this iterator. */ protected int id; /** The count of the last returned document. */ private int count = -1; /** Whether all underlying index iterators have counts. */ private final boolean hasCounts; /** Whether all underlying index iterators have positions. */ private final boolean hasPositions; /** Returns an index iterator that merges the given array of iterators. * This method requires that at least one iterator is provided. The frequency is computed as a max, * and {@link #index()} will return the result of the same method on the first iterator. * * @param indexIterator the iterators to be joined (at least one). * @return a merged index iterator. * @throws IllegalArgumentException if no iterators were provided. */ public static IndexIterator getInstance( final IndexIterator... indexIterator ) throws IOException { return getInstance( Integer.MIN_VALUE, indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * * <P>Note that the special case of the empty and of the singleton arrays * are handled efficiently. The frequency is computed as a max, and * {@link #index()} will return <code>index</code>. * * @param index the index that wil be returned by {@link #index()}. * @param indexIterator the iterators to be joined. * @return a merged index iterator. */ public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator ) throws IOException { return getInstance( Integer.MIN_VALUE, index, indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * This method requires that at least one iterator is provided. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param indexIterator the iterators to be joined (at least one). * @return a merged index iterator. * @throws IllegalArgumentException if no iterators were provided, or they run on different indices. */ public static IndexIterator getInstance( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException { if ( indexIterator.length == 0 ) throw new IllegalArgumentException(); return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * * <P>Note that the special case of the empty and of the singleton arrays * are handled efficiently. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param index the index that wil be returned by {@link #index()}. * @param indexIterator the iterators to be joined. * @return a merged index iterator. * @throws IllegalArgumentException if there is some iterator on an index different from <code>index</code>. */ public static IndexIterator getInstance( final int defaultFrequency, final Index index, final IndexIterator... indexIterator ) throws IOException { if ( indexIterator.length == 0 ) return index.emptyIndexIterator; if ( indexIterator.length == 1 ) return indexIterator[ 0 ]; return new MultiTermIndexIterator( defaultFrequency, indexIterator ); } /** Creates a new document iterator that merges the given array of iterators. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param indexIterator the iterators to be joined. */ @SuppressWarnings("cast") protected MultiTermIndexIterator( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException { super( (DocumentIterator[]) indexIterator ); this.frequency = defaultFrequency; boolean havePositions = true, haveCounts = true; for( IndexIterator i: indexIterator ) { if ( ! i.index().hasCounts ) haveCounts = false; if ( ! i.index().hasPositions ) havePositions = false; } hasCounts = haveCounts; hasPositions = havePositions; } protected IntervalIterator getComposedIntervalIterator( final Index index ) { return new MultiTermIntervalIterator(); } @Override public int skipTo( final int n ) throws IOException { if ( last >= n ) return last; // We invalidate count before calling the superclass method. count = -1; return super.skipTo( n ); } public int nextDocument() throws IOException { // We invalidate count before calling the superclass method. count = -1; return super.nextDocument(); } /** The count is the sum of counts of those component iterators positioned on the current document. * * @return the sum of counts. */ public int count() throws IOException { if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" ); if ( last == -1 ) throw new IllegalStateException(); if ( count == -1 ) { int count = 0; for ( int i = computeFront(); i-- != 0; ) count += indexIterator[ front[ i ] ].count(); this.count = count; } return count; } /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators. * * @return the frequency. */ public int frequency() throws IOException { if ( frequency != Integer.MIN_VALUE ) return frequency; int frequency = Integer.MIN_VALUE; for ( int i = n; i-- != 0; ) frequency = Math.max( frequency, indexIterator[ i ].frequency() ); return this.frequency = frequency; // caching it! } public void term( final CharSequence term ) { this.term = term == null ? null : term.toString(); } public String term() { return term; } public int termNumber() { // TODO: this is not particularly sensible return indexIterator[ 0 ].termNumber(); } public void id( final int id ) { this.id = id; } public int id() { return id; } public Index index() { return soleIndex; } /** This method is not implemented by this class. */ public Payload payload() { throw new UnsupportedOperationException(); } public int[] positionArray() throws IOException { if ( ! hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" ); // If the front contains a single element, we can just use its position array. if ( computeFront() == 1 ) return indexIterator[ front[ 0 ] ].positionArray(); final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator)intervalIterator(); multiTermIntervalIterator.drain(); return multiTermIntervalIterator.cache; } public IntIterator positions() throws IOException { return IntIterators.wrap( positionArray(), 0, count ); } public int positions( int[] position ) throws IOException { int c = count; if ( position.length < c ) return -c; final int[] cache = positionArray(); for( int i = c; i-- != 0; ) position[ i ] = cache[ i ]; return c; } @Override public boolean accept( DocumentIteratorVisitor visitor ) throws IOException { return visitor.visit( this ); } @Override public boolean acceptOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException { return visitor.visit( this ); } /** An optimised interval iterator with the same semantics as that implemented * by {@link OrDocumentIterator}, but not allowing duplicate positions.s * * <p>This iterator provides an additional {@link #drain()} method that exhausts the * merge queue, leaving however the returned elements in the {@link #cache} array. Moreover, * the internal state of the iterator is modified so that it continues to behave normally, * returning however its results from {@link #cache}. In this way we can easily provide * efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()}, * and {@link IndexIterator#positions(int[])}. */ private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator { @SuppressWarnings({ "unused", "hiding" }) private final static boolean DEBUG = false; @SuppressWarnings("hiding") private final static boolean ASSERTS = false; /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */ private final IntHeapSemiIndirectPriorityQueue positionQueue; /** The cached results of this iterator. */ public int[] cache; /** The number of results emitted by this iterator since the last call to {@link #reset()}. */ private int emitted; /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */ private int extracted; public MultiTermIntervalIterator() { super( n ); positionQueue = new IntHeapSemiIndirectPriorityQueue( curr ); cache = new int[ 4 ]; } public void reset() throws IOException { emitted = extracted = 0; next = null; positionQueue.clear(); for ( int i = computeFront(), k; i-- != 0; ) { k = front[ i ]; position[ k ] = indexIterator[ k ].positionArray(); count[ k ] = indexIterator[ k ].count(); curr[ k ] = position[ k ][ 0 ]; currPos[ k ] = 0; positionQueue.enqueue( k ); } if ( ASSERTS ) assert ! positionQueue.isEmpty(); } public void intervalTerms( final IntSet terms ) { // TODO: this is not particularly sensible terms.add( indexIterator[ 0 ].termNumber() ); } public Interval nextInterval() { if ( next != null ) { final Interval result = next; next = null; return result; } if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] ); if ( positionQueue.isEmpty() ) return null; final int first = positionQueue.first(); if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 ); cache[ extracted++ ] = curr[ first ]; if ( ++currPos[ first ] < count[ first ] ) { curr[ first ] = position[ first ][ currPos[ first ] ]; positionQueue.changed(); if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this ); } else positionQueue.dequeue(); return Interval.valueOf( cache[ emitted++ ] ); } public int extent() { return 1; } /** Drains all elements from the queue, stores them in {@link #cache} and * restores {@link #emitted} so that the iterators continues to work transparently. */ public void drain() { final int emittedNow = emitted - ( next != null ? 1 : 0 ); next = null; while( nextInterval() != null ); emitted = emittedNow; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -