📄 abstractintersectiondocumentiterator.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.search;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import java.io.IOException;import java.util.Arrays;import java.util.Comparator;import java.util.Iterator;/** An abstract iterator on documents, generating the intersection of the documents returned by  * a number of document iterators. *  * <P>To be usable, this class must be subclassed so to provide also an iterator on intervals. * Such iterators must be instantiable using {@link #getComposedIntervalIterator(Index)}. * The latter is an example of a <em>non-static factory method</em>, that is, a factory method which * depends on the enclosing instance. This pattern allows to easily specialise this class to iterators * that do different things, such as {@link it.unimi.dsi.mg4j.search.AndDocumentIterator} and * {@link it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator}, but that have a similar semantics at the document * level (the semantics may in fact be slightly different: for instance, not all document belonging * to all components will actually appear in a consecutive iterator, as there may be documents filtered * at the interval level). *  * <P>The important invariant is that <em>only</em> after a call to {@link #nextDocument()}, a call * to {@link #intervalIterator(Index)} will return an interval iterator over the document * just returned, and that for at least one index in {@link #indices()} the iterator will not be empty * or {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE}. *  * <h2>The intersection algorithm</h2> *  * <p>Since MG4J 1.1, this class implements a new intersection algorithm that should be significantly * faster than the previous one. The main idea is that of letting sparser iterator interact as much * as possible to obtain a candidate common document, and then trying to align the others. At construction * time, the component iterators are sorted so that index iterators are separated, and sorted by frequency. * Then, each time we have to align the iterators we align them greedily starting from the index * iterators, in frequency order. This has the effect of skipping very quickly (and usually by * large jumps, which are handled nicely by indices with skips), * as the main interaction happens between low-frequency index iterators. *  * <p>Moreover, this class treats in a special way *  {@linkplain PayloadPredicateDocumentIterator index iterators coming from payload-based indices}. Such * iterators are checked at the end of the alignment process,  * after all standard index iterators (and general document iterators) * are aligned. At that point, the special method {@link PayloadPredicateDocumentIterator#skipUnconditionallyTo(int)} * is used to position unconditionally such iterators and check whether the payload predicate is satisfied. * If this doesn't happen, the current candidate (obtained by alignment of standard iterators) is increased and the * whole process is restarted. This procedure guarantees that we will never search exhaustively in a  * payload-based index a document record satisfying the predicate (unless, of course, we have a query * containing just {@link PayloadPredicateDocumentIterator}s), which is very efficient if the payload-based * index uses skipping. *  */public abstract class AbstractIntersectionDocumentIterator extends AbstractCompositeDocumentIterator {	private final static boolean DEBUG = false;	private final static boolean ASSERTS = false;	/** A map from indices to interval iterators. */	final protected Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;	/** A map from indices to the iterators returned for the current document. The key set may	 * not contain an index because the related iterator has never been requested. Moreover,	 * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}	 * because it could be {@link IntervalIterators#TRUE TRUE} (in fact, in that case it may even	 * happen that {@link #intervalIterators} does not contain the index). */	final protected Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;	/** An unmodifiable wrapper around {@link #currentIterators}. */	final protected Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;	/** The provided document iterators, suitably sorted. */	final private DocumentIterator[] sortedIterator;	/** The last element of {@link #sortedIterator}. */	final private DocumentIterator lastIterator;	/** The prefix of {@link #sortedIterator} made of {@link PayloadPredicateDocumentIterator}s. */	final private PayloadPredicateDocumentIterator[] payloadPredicateDocumentIterator;	/** Iterators in {@link #sortedIterator} up to this position (exclusive) are instances of {@link PayloadPredicateDocumentIterator}. */	final private int predicateStart;	/** When true, the intersection list has been exhausted (of course, if {@link #next} is not -1 there is still an element to be returned). */	private boolean exhausted;	/** The current element on which all iterators are aligned, unless {@link #exhausted} is true, in which case the value is undefined. */	private int curr;	/** Creates a new intersection iterator using a given array of iterators and a given index.	 *  @param index an index that will be passed to {@link AbstractCompositeDocumentIterator#AbstractCompositeDocumentIterator(Index, DocumentIterator...)}.	 *  @param documentIterator the iterators to be insersected (at least one).	 */	protected AbstractIntersectionDocumentIterator( final Index index, final DocumentIterator[] documentIterator ) throws IOException {		super( index, documentIterator ); 		if ( documentIterator.length == 0 ) throw new IllegalArgumentException();		sortedIterator = documentIterator.clone(); // We need a copy to reorder iterators				intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );		currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );		unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );				// We now reorder iterators putting in the back the index iterators of smallest frequency and moving to front payload-predicate iterators.		Arrays.sort( sortedIterator, new Comparator<DocumentIterator>() {			public int compare( final DocumentIterator d0, final DocumentIterator d1 ) {				final PayloadPredicateDocumentIterator p0 = d0 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d0 : null;				final PayloadPredicateDocumentIterator p1 = d1 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d1 : null;								if ( p0 != null && p1 != null ) return 0;				if ( p0 != null ) return -1;				if ( p1 != null ) return 1;								final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator)d0 : null;				final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator)d1 : null;				if ( i0 == null && i1 == null ) return 0;				if ( ( i0 != null ) != ( i1 != null ) ) return ( i0 != null ) ? 1 : -1;				try {					return i1.frequency() - i0.frequency();				}				catch ( IOException e ) {					throw new RuntimeException( e );				}			}		}		);		lastIterator = sortedIterator[ n - 1 ];		int i;		for( i = n; i-- != 0; ) if ( sortedIterator[ i ] instanceof PayloadPredicateDocumentIterator ) break;		predicateStart = i + 1;		payloadPredicateDocumentIterator = new PayloadPredicateDocumentIterator[ predicateStart ];		for( i = predicateStart; i-- != 0; ) payloadPredicateDocumentIterator[ i ] = (PayloadPredicateDocumentIterator)sortedIterator[ i ];				if ( DEBUG ) System.err.println( "Sorted iterators: " + Arrays.toString( sortedIterator ) );				/* We now advance all iterators to their first common pointer, if any. Note		 * that the difference between documentIterator and this.documentIterator		 * is immaterial here. */		for ( i = n; i-- != 0; ) 			if ( ! sortedIterator[ i ].hasNext() ) {				// If any of the iterators is empty, we're over.				exhausted = true;				return;			}		if ( align() ) next = curr;		else exhausted = true;	}	/** Creates a new intersection iterator using a given array of iterators.	 *  @param documentIterator the iterators to be insersected (at least one).	 */	protected AbstractIntersectionDocumentIterator( final DocumentIterator[] documentIterator ) throws IOException {		this( null, documentIterator );	}		/** Align all iterators on the first common document pointer after {@link #curr}.	 * 	 * <P>After a call to this method, all component iterators are positioned	 * on the same document (and {@link #curr} is set to that document), 	 * unless the method returns false, in which case there	 * are no more document pointer making alignment possible (and {@link #exhausted} is true).	 * 	 * @return true if all component iterators are aligned, false if	 * no alignment position could be found.	 */	private boolean align() throws IOException {		if ( ASSERTS ) assert ! exhausted;		if ( DEBUG ) System.err.println( this + ".align()..." );				int i, res;		int candidate = curr;				for(;;) {			for( i = n; i-- != 0 ; ) {				if ( i < predicateStart ) res = payloadPredicateDocumentIterator[ i ].skipUnconditionallyTo( candidate );				else res = sortedIterator[ i ].skipTo( candidate );				if ( res == Integer.MAX_VALUE ) return false;								if ( res != candidate ) {					// Note that for payload-predicate document iterators res might be negative					if ( res > candidate ) candidate = res;					else {						if ( ASSERTS ) assert res < 0;						candidate++;					}					break;				}			}						if ( i == -1 ) {				curr = candidate;				return true;			}		}	}	public int skipTo( final int n ) throws IOException {		if ( last >= n ) return last;		last = next = -1;		currentIterators.clear();		if ( curr < n ) {			if ( ( curr = lastIterator.skipTo( n ) ) == Integer.MAX_VALUE ) {				exhausted = true;				return Integer.MAX_VALUE;			}			else exhausted = ! align();		}		return exhausted ? Integer.MAX_VALUE : ( last = curr );	}		public int nextDocument() throws IOException {		if ( DEBUG ) System.err.println( this + ".hasNext()" );		if ( next >= 0 ) {			// We already know what to return			last = next;			next = -1;			return last;		}		last = next = -1;		currentIterators.clear();		if ( ( curr = lastIterator.nextDocument() ) == -1 ) exhausted = true;		else exhausted = ! align();		if ( exhausted ) return -1;		return last = curr;	}	public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {		final Iterator<Index> i = indices.iterator();		while( i.hasNext() ) intervalIterator( i.next() );		return unmodifiableCurrentIterators;	}	public IntervalIterator intervalIterator( final Index index ) throws IOException {		if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );		if ( last == -1 ) throw new IllegalStateException(); 		if ( ! indices.contains( index ) ) return IntervalIterators.TRUE;		IntervalIterator intervalIterator;		// If the iterator has been created and it's ready, we just return it.				if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;		int i, c;					/* None of the iterators may be FALSE. Otherwise, if all iterators are TRUE, we return TRUE. Otherwise, we return		 * (possibly after creation) the underlying interval iterator.		 * 		 * In the case of index iterators, we can avoid the check. No index iterator can return FALSE, and		 * at least one must return an iterator != TRUE (as indices.contains(index)).		 */		if ( indexIterator == null ) 			for( i = c = 0; i < n; i++ ) {				intervalIterator = documentIterator[ i ].intervalIterator( index );				// We cannot be on a document one of whose iterators if FALSE.				if ( intervalIterator == IntervalIterators.FALSE ) break;				if ( intervalIterator != IntervalIterators.TRUE ) c++;			}		else i = c = n;				// Note that we cannot optimise the case c == 1 because of gaps in ConsecutiveDocumentIterator.		if ( i < n ) intervalIterator = IntervalIterators.FALSE;		else if ( c == 0 ) intervalIterator = IntervalIterators.TRUE;		else {			intervalIterator = intervalIterators.get( index );			if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) );			intervalIterator.reset();		}		currentIterators.put( index, intervalIterator );			return intervalIterator;	}		abstract protected IntervalIterator getComposedIntervalIterator( Index index );}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -