📄 consecutivedocumentiterator.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.search;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.util.Interval;import it.unimi.dsi.util.Intervals;import java.io.IOException;/** An iterator returning documents containing consecutive intervals (in query order)  * satisfying the underlying queries. *  * <p>As an additional service, this class makes it possible to specify <em>gaps</em> between * intervals. If gaps are specified, a match will satisfy the condition * that the left extreme of the first interval is larger than or equal to the * first gap, the left extreme of the second interval is larger than * the right extreme of the first interval plus the second gap, and so on.  The standard * semantics corresponds thus to the everywhere zero gap array.  *  * <p>This semantics * makes it possible to peform phrasal searches &ldquo;with holes&rdquo;, typically * because of stopwords that have not been indexed. Note that it is possible to specify * a gap <em>before</em> the first interval, but not <em>after</em> the last interval, * as in general the document length is not known at this level of query resolution. *  * <p>This class will handle correctly {@link IntervalIterators#TRUE TRUE} iterators; in this * case, the semantics is defined as follows: an interval is in the output if it is formed by the union of disjoint intervals, * one from each input list, and each gap of value <var>k</var> corresponds to <var>k</var> iterators * returning all document positions as singleton intervals. Since {@link IntervalIterators#TRUE TRUE} represents a list containing just * the empty interval, the result is equivalent to dropping {@link IntervalIterators#TRUE TRUE} iterators from the input; as * a consequence, the gap of a {@link IntervalIterators#TRUE TRUE} iterator is merged with that of the following iterator. *  * <p><strong>Warning</strong>: In case gaps are specified, the mathematically correct semantics would require that * gaps before {@link IntervalIterators#TRUE TRUE} iterators that are not followed by any non-{@link IntervalIterators#TRUE TRUE} iterators * have the effect of enlarging the resulting intervals <em>on the right side</em>. However, * this behaviour is very difficult to implement at this level because document lengths are not known. For this * reason, if one or more {@link IntervalIterators#TRUE TRUE} iterators appear a the end of the component iterator list they will be simply dropped.   */public class ConsecutiveDocumentIterator extends AbstractOrderedIntervalDocumentIterator {	/** The gap array. This is essentially the array provided at construction time; however, if a {@link ConsecutiveIndexIntervalIterator}	 * is requested by {@link #getComposedIntervalIterator(Index)} this array will be used to store <em>cumulative</em> gaps. */	private final int gap[];		/** Returns a document iterator that computes the consecutive AND of the given array of iterators.	 * 	 * <P>Note that the special case of the empty and of the singleton arrays	 * are handled efficiently.	 *  	 * @param numberOfDocuments the number of documents; relevant only if <code>it</code> has zero length.	 * @param documentIterator the iterators to be composed.	 * @return a document iterator that computes the consecutive AND of <code>it</code>. 	 * @throws IOException 	 */	public static DocumentIterator getInstance( final int numberOfDocuments, final DocumentIterator... documentIterator ) throws IOException {		if ( documentIterator.length == 0 ) return NotDocumentIterator.getInstance( DocumentIterators.EMPTY_ITERATOR, numberOfDocuments );		if ( documentIterator.length == 1 ) return documentIterator[ 0 ];		return new ConsecutiveDocumentIterator( documentIterator, null );	}		/** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators.	 * 	 * <P>Note that the special case of the singleton array is handled efficiently.	 * 	 * @param documentIterator the iterators to be composed (at least one).	 * @return a document iterator that computes the consecutive AND of <code>documentIterator</code>. 	 * @throws IOException 	 */	public static DocumentIterator getInstance( final DocumentIterator... documentIterator ) throws IOException {		if ( documentIterator.length == 1 ) return documentIterator[ 0 ];		return getInstance( -1, documentIterator );	}		/** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators, adding	 * gaps between intervals.	 * 	 * <p>A match will satisfy the condition	 * that the left extreme of the first interval is larger than or equal to the	 * first gap, the left extreme of the second interval is larger than 	 * the right extreme of the first interval plus the second gap, and so on. This semantics	 * makes it possible to perform phrasal searches &ldquo;with holes&rdquo;, typically	 * because of stopwords that have not been indexed.	 * 	 * @param documentIterator the iterators to be composed (at least one).	 * @param gap an array of gaps parallel to <code>documentIterator</code>, or <code>null</code> for no gaps. 	 * @return a document iterator that computes the consecutive AND of <code>documentIterator</code> using the given gaps.  	 * @throws IOException 	 */	public static DocumentIterator getInstance( final DocumentIterator documentIterator[], final int gap[] ) throws IOException {		if ( gap != null && gap.length != documentIterator.length ) throw new IllegalArgumentException( "The number of gaps (" + gap.length + ") is not equal to the number of document iterators (" + documentIterator.length +")" );		if ( documentIterator.length == 1 && ( gap == null || gap[ 0 ] == 0 ) ) return documentIterator[ 0 ];		return new ConsecutiveDocumentIterator( documentIterator, gap );	}		protected ConsecutiveDocumentIterator( final DocumentIterator[] documentIterator, final int[] gap ) throws IOException {		super( documentIterator );		if ( gap == null ) this.gap = new int[ n ];		else this.gap = gap.clone();	}	protected IntervalIterator getComposedIntervalIterator( final Index unused ) {		if ( ASSERTS ) assert unused == soleIndex;		if ( indexIterator == null ) return new ConsecutiveIntervalIterator( gap );		// In this case, gap must be made cumulative.		for( int i = 1; i < n; i++ ) gap[ i ] += gap[ i - 1 ] + 1;		return new ConsecutiveIndexIntervalIterator( gap );	}		/** An interval iterator returning the BLOCK of the component iterator 	 * (i.e., intervals made of sequences of consecutive intervals	 * of the component iterator, in the given order). 	 * 	 * <p>In this implementation, {@link #advanced} is	 * never true when {@link AbstractOrderedIntervalIterator#endOfProcess} is true. 	 */		private class ConsecutiveIntervalIterator extends AbstractOrderedIntervalIterator {		/** A cached reference to the gap array. */		@SuppressWarnings("hiding")		private final int[] gap;		/** The actual gaps. They depend on whether some {@link IntervalIterators#TRUE} iterator reduces the iterator array. */		private final int[] actualGap;		/** Whether the scan is over. */		private boolean endOfProcess;		/** The number of non-{@link IntervalIterators#TRUE} interval iterator. */		private int m;		public ConsecutiveIntervalIterator( final int[] gap ) {			this.gap = gap;			// The enlargement is made necessary by the filling long in reset().			this.actualGap = new int[ n + 1 ];		}		/** Loads {@link #curr} with the first interval from each non-{@link IntervalIterators#TRUE} iterator, leaving		 * in {@link #m} the number of non-{@link IntervalIterators#TRUE} iterators, and in {@link #actualGap}		 * the gaps to be used for those {@link #m} iterators.		 */		public void reset() throws IOException {			final int[] actualGap = this.actualGap;			final int[] gap = this.gap;			final IntervalIterator[] intervalIterator = this.intervalIterator;						actualGap[ m = 0 ] = 0;			int i;			for( i = 0; i < n; i++ ) {				intervalIterator[ m ] = documentIterator[ i ].intervalIterator();				if ( ASSERTS ) assert intervalIterator[ m ].hasNext();				actualGap[ m ] += gap[ i ];				curr[ m ] = Intervals.MINUS_INFINITY;				if ( intervalIterator[ m ] != IntervalIterators.TRUE ) actualGap[ ++m ] = 1;			}			if ( m == 0 ) throw new IllegalStateException();						next = null;			do	curr[ 0 ] = intervalIterator[ 0 ].nextInterval(); while( curr[ 0 ] != null && curr[ 0 ].left < actualGap[ 0 ] );			if ( ! ( endOfProcess = curr[ 0 ] == null ) ) next = align();		}				public void intervalTerms( final IntSet terms ) {			for( int i = m; i-- != 0; ) intervalIterator[ i ].intervalTerms( terms );		}				private Interval align() throws IOException {			if ( DEBUG ) System.err.println( this + ".align()" );						final Interval[] curr = this.curr;			final int[] actualGap = this.actualGap;			final IntervalIterator[] intervalIterator = this.intervalIterator;						if ( DEBUG ) System.err.println( java.util.Arrays.asList( curr ) );			int k = 0;			Interval interval;			while( k < m ) {				for ( k = 1; k < m; k++ ) {					while ( curr[ k ].left < curr[ k - 1 ].right + actualGap[ k ] )						if ( ( curr[ k ] = intervalIterator[ k ].nextInterval() ) == null ) {							endOfProcess = true;							return null;						}					if ( curr[ k ].left > curr[ k - 1 ].right + actualGap[ k ] ) {						final int limit = curr[ k ].left - actualGap[ k ];						/* Note that we are not interested in intervals whole right extreme is						 * smaller than limit, as it is impossible to find a match in that case. */						while ( ( interval = intervalIterator[ 0 ].nextInterval() ) != null && interval.right < limit );						if ( endOfProcess = interval == null ) return null;						curr[ 0 ] = interval;						break;					} 				}			}						return Interval.valueOf( curr[ 0 ].left - actualGap[ 0 ], curr[ m - 1 ].right );		}				public Interval nextInterval() throws IOException {			if ( next != null ) {				final Interval result = next;				next = null; 				return result;			}			if ( endOfProcess ) return null;						if ( ( curr[ 0 ] = intervalIterator[ 0 ].nextInterval() ) == null ) {				endOfProcess = true;				return null;			}			return align();		}				public int extent() {			int s = 0;			for ( int i = m; i-- != 0; ) s += intervalIterator[ i ].extent() + actualGap[ i ];			return s;		}	}	private class ConsecutiveIndexIntervalIterator extends AbstractOrderedIndexIntervalIterator {		/** A cached reference to the gap array. */		@SuppressWarnings("hiding")		private final int[] gap;		/** Whether the scan is over. */		private boolean endOfProcess;				public ConsecutiveIndexIntervalIterator( final int[] gap ) {			this.gap = gap;		}				/** Resets the iterator by calling the superclass method, and then aligning all iterators.		 * 		 * <p>Note that in this class {@link #curr} is used to denote the value of the current position		 * minus the corresponding {@linkplain #gap cumulative gap}; this method updates {@link #curr} accordingly. */		public void reset() throws IOException {			final int[][] position = this.position;			final int[] currPos = this.currPos;			final int[] curr = this.curr;			final int[] count = this.count;			final int[] gap = this.gap;			IntArrays.fill( currPos, 0 );			for( int i = n; i-- != 0; ) {				count[ i ] = indexIterator[ i ].count();				position[ i ] = indexIterator[ i ].positionArray();				curr[ i ] = position[ i ][ 0 ];			}			endOfProcess = false;			for( int i = n; i-- != 0; ) curr[ i ] -= gap[ i ];						if ( gap[ 0 ] != 0 ) {				// Go beyond the 0-th gap. This must be done just once.				next = null;				while ( curr[ 0 ] < 0 && ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];				endOfProcess = currPos[ 0 ] == count[ 0 ];			}			if ( ! endOfProcess ) next = align();		}		public void intervalTerms( final IntSet terms ) {			for( int i = n; i-- != 0; ) terms.add( indexIterator[ i ].termNumber() );		}				private Interval align() {			if ( DEBUG ) System.err.println( this + ".align()" );			final int[][] position = this.position;			final int[] currPos = this.currPos;			final int[] curr = this.curr;			final int[] count = this.count;			final int[] gap = this.gap;						int c, k = 1, l = n <= 2 ? 0 : 2; // This is actually 2 % n			boolean oneRoundDone = false;			int[] p;			int start = curr[ 0 ];						for(;;) {				p = position[ k ];				c = currPos[ k ];				// First, we try to align the k-th term.				while ( c < count[ k ] && p[ c ] - gap[ k ] < start ) c++;				// If we exhaust the term positions, it's all over.				if ( c == count[ k ] ) {					endOfProcess = true;					return null;				}				curr[ k ] = p[ currPos[ k ] = c ] - gap[ k ];				// If we went beyond start + k, we must update start.				if ( curr[ k ] > start ) start = curr[ k ]; 								// If k == 0, it means we have made a full round of alignment, so the next check is now valid.				oneRoundDone |= ( k == 0 );				// If oneRoundDone, all current normalised positions (curr[ x ] - gap[ x ]) are squeezed between start and curr[ l ].				if ( oneRoundDone && curr[ l ] == start ) return Interval.valueOf( curr[ 0 ], curr[ 0 ] + gap[ n - 1 ] ); 				k = l;								if ( ( l = l + 1 ) == n ) l = 0;			}		}				public Interval nextInterval() {			if ( next != null ) {				final Interval result = next;				next = null; 				return result;			}			if ( endOfProcess ) return null;						if ( ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];			else {				endOfProcess = true;				return null;			}			return align();					}	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -