📄 multitermindexiterator.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.index;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator;import it.unimi.dsi.mg4j.search.DocumentIterator;import it.unimi.dsi.util.Interval;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.OrDocumentIterator;import it.unimi.dsi.mg4j.search.score.BM25Scorer;import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;import java.io.IOException;/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.** <P>This class adds to {@link it.unimi.dsi.mg4j.search.AbstractUnionDocumentIterator}* an interval interator generating the OR of the intervals returned for each of the documents involved.* The main difference with an {@link OrDocumentIterator} built on the same array of component iterators* is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum* of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The* latter is by default the maximum frequency of a component iterator, but it can be set * at {@link MultiTermIndexIterator#getInstance(int, Index, IndexIterator[]) construction time}.* * <p>The main <i>raison d'&ecirc;tre</i> of this class is support for query expansion: a blind application* of {@link OrDocumentIterator} to an array of index iterators would mislead scorers such as {@link BM25Scorer}* because low-frequency terms (e.g., <i>hapax legomena</i>) would be responsible for most of the score. */public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator {	@SuppressWarnings("unused")	private static final boolean ASSERTS = false;		/** Value to be used for term frequency, or {@link Integer#MIN_VALUE} to use the max; in any case, this attribute is used to cache	 *  frequency after the first call to {@link #frequency()}. */	private int frequency;	/** The term of this iterator. */	protected String term;	/** The id of this iterator. */	protected int id;	/** The count of the last returned document. */	private int count = -1;	/** Whether all underlying index iterators have counts. */	private final boolean hasCounts; 	/** Whether all underlying index iterators have positions. */	private final boolean hasPositions;		/** Returns an index iterator that merges the given array of iterators.	 *  This method requires that at least one iterator is provided. The frequency is computed as a max,	 *  and {@link #index()} will return the result of the same method on the first iterator.	 * 	 * @param indexIterator the iterators to be joined (at least one).	 * @return a merged index iterator. 	 * @throws IllegalArgumentException if no iterators were provided.	 */	public static IndexIterator getInstance( final IndexIterator... indexIterator  ) throws IOException {		return getInstance( Integer.MIN_VALUE, indexIterator );	}	/** Returns an index iterator that merges the given array of iterators.	 * 	 * <P>Note that the special case of the empty and of the singleton arrays	 * are handled efficiently. The frequency is computed as a max, and	 * {@link #index()} will return <code>index</code>.	 * 	 * @param index the index that wil be returned by {@link #index()}.	 * @param indexIterator the iterators to be joined.	 * @return a merged index iterator. 	 */	public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator  ) throws IOException {		return getInstance( Integer.MIN_VALUE, index, indexIterator );	}	/** Returns an index iterator that merges the given array of iterators.	 *  This method requires that at least one iterator is provided.	 * 	 * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).	 * @param indexIterator the iterators to be joined (at least one).	 * @return a merged index iterator. 	 * @throws IllegalArgumentException if no iterators were provided, or they run on different indices.	 */	public static IndexIterator getInstance( final int defaultFrequency, final IndexIterator... indexIterator  ) throws IOException {		if ( indexIterator.length == 0 ) throw new IllegalArgumentException();		return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator );	}	/** Returns an index iterator that merges the given array of iterators.	 * 	 * <P>Note that the special case of the empty and of the singleton arrays	 * are handled efficiently. 	 * 	 * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).	 * @param index the index that wil be returned by {@link #index()}.	 * @param indexIterator the iterators to be joined.	 * @return a merged index iterator. 	 * @throws IllegalArgumentException if there is some iterator on an index different from <code>index</code>.	 */	public static IndexIterator getInstance( final int defaultFrequency, final Index index, final IndexIterator... indexIterator  ) throws IOException {		if ( indexIterator.length == 0 ) return index.emptyIndexIterator;		if ( indexIterator.length == 1 ) return indexIterator[ 0 ];		return new MultiTermIndexIterator( defaultFrequency, indexIterator );	}		/** Creates a new document iterator that merges the given array of iterators. 	 * 	 * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max).  	 * @param indexIterator the iterators to be joined.	 */	@SuppressWarnings("cast")	protected MultiTermIndexIterator( final int defaultFrequency, final IndexIterator... indexIterator ) throws IOException {		super( (DocumentIterator[]) indexIterator );		this.frequency = defaultFrequency;		boolean havePositions = true, haveCounts = true;		for( IndexIterator i: indexIterator ) {			if ( ! i.index().hasCounts ) haveCounts = false;			if ( ! i.index().hasPositions ) havePositions = false;						}				hasCounts = haveCounts;		hasPositions = havePositions;	}	protected IntervalIterator getComposedIntervalIterator( final Index index ) {		return new MultiTermIntervalIterator();	}	@Override	public int skipTo( final int n ) throws IOException {		if ( last >= n ) return last;		// We invalidate count before calling the superclass method.		count = -1;		return super.skipTo( n );	}		public int nextDocument() throws IOException {		// We invalidate count before calling the superclass method.		count = -1;		return super.nextDocument();	}		/** The count is the sum of counts of those component iterators positioned on the current document.	 * 	 *  @return the sum of counts.	 */	public int count() throws IOException {		if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" );		if ( last == -1 ) throw new IllegalStateException();		if ( count == -1 ) {			int count = 0;			for ( int i = computeFront(); i-- != 0; ) count += indexIterator[ front[ i ] ].count();			this.count = count;		}		return count;	}	/** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators. 	 * 	 * @return the frequency.	 */	public int frequency() throws IOException {		if ( frequency != Integer.MIN_VALUE ) return frequency;		int frequency = Integer.MIN_VALUE;		for ( int i = n; i-- != 0; ) frequency = Math.max( frequency, indexIterator[ i ].frequency() );		return this.frequency = frequency; // caching it!	}	public void term( final CharSequence term ) {		this.term = term == null ? null : term.toString();	}	public String term() { 		return term;	}	public int termNumber() {		// TODO: this is not particularly sensible		return indexIterator[ 0 ].termNumber();	}		public void id( final int id ) {		this.id = id;	}		public int id() {		return id;	}	public Index index() {		return soleIndex;	}	/** This method is not implemented by this class.	 */	public Payload payload() {		throw new UnsupportedOperationException();	}	public int[] positionArray() throws IOException {		if ( ! hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" );		// If the front contains a single element, we can just use its position array.		if ( computeFront() == 1 ) return indexIterator[ front[ 0 ] ].positionArray();				final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator)intervalIterator();		multiTermIntervalIterator.drain();		return multiTermIntervalIterator.cache;	}	public IntIterator positions() throws IOException {				return IntIterators.wrap( positionArray(), 0, count );	}	public int positions( int[] position ) throws IOException {		int c = count;		if ( position.length < c ) return -c;		final int[] cache = positionArray();		for( int i = c; i-- != 0; ) position[ i ] = cache[ i ];		return c;	}		@Override	public boolean accept( DocumentIteratorVisitor visitor ) throws IOException {		return visitor.visit( this );	}	@Override	public boolean acceptOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException {		return visitor.visit( this );	}			/** An optimised interval iterator with the same semantics as that implemented	 *  by {@link OrDocumentIterator}, but not allowing duplicate positions.s	 *  	 *  <p>This iterator provides an additional {@link #drain()} method that exhausts the	 *  merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,	 *  the internal state of the iterator is modified so that it continues to behave normally,	 *  returning however its results from {@link #cache}. In this way we can easily provide	 *  efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},	 *  and {@link IndexIterator#positions(int[])}.	 */	private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {		@SuppressWarnings({ "unused", "hiding" })		private final static boolean DEBUG = false;		@SuppressWarnings("hiding")		private final static boolean ASSERTS = false;		/** A heap-based indirect priority queue used to keep track of the currently scanned positions. */		private final IntHeapSemiIndirectPriorityQueue positionQueue;		/** The cached results of this iterator. */		public int[] cache;		/** The number of results emitted by this iterator since the last call to {@link #reset()}. */		private int emitted;		/** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */		private int extracted;		public MultiTermIntervalIterator() {			super( n );			positionQueue = new IntHeapSemiIndirectPriorityQueue( curr );			cache = new int[ 4 ];		}		public void reset() throws IOException {			emitted = extracted = 0;			next = null;			positionQueue.clear();			for ( int i = computeFront(), k; i-- != 0; ) {				k = front[ i ];				position[ k ] = indexIterator[ k ].positionArray();				count[ k ] = indexIterator[ k ].count();				curr[ k ] = position[ k ][ 0 ];				currPos[ k ] = 0;				positionQueue.enqueue( k );			}			if ( ASSERTS ) assert ! positionQueue.isEmpty();		}		public void intervalTerms( final IntSet terms ) {			// TODO: this is not particularly sensible			terms.add( indexIterator[ 0 ].termNumber() );		}				public Interval nextInterval() {			if ( next != null ) {				final Interval result = next;				next = null;				return result;			}						if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] );			if ( positionQueue.isEmpty() ) return null;			final int first = positionQueue.first();			if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 );			cache[ extracted++ ] = curr[ first ];			if ( ++currPos[ first ] < count[ first ] ) {				curr[ first ] = position[ first ][ currPos[ first ] ];				positionQueue.changed();				if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this );			}			else positionQueue.dequeue();							return Interval.valueOf( cache[ emitted++ ] );		}		public int extent() {			return 1;		}				/** Drains all elements from the queue, stores them in {@link #cache} and		 * restores {@link #emitted} so that the iterators continues to work transparently. 		 */				public void drain() {			final int emittedNow = emitted - ( next != null ? 1 : 0 );			next = null;			while( nextInterval() != null );			emitted = emittedNow;		} 	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -