📄 abstractintersectiondocumentiterator.java
字号:
package it.unimi.dsi.mg4j.search;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import java.io.IOException;import java.util.Arrays;import java.util.Comparator;import java.util.Iterator;/** An abstract iterator on documents, generating the intersection of the documents returned by * a number of document iterators. * * <P>To be usable, this class must be subclassed so to provide also an iterator on intervals. * Such iterators must be instantiable using {@link #getComposedIntervalIterator(Index)}. * The latter is an example of a <em>non-static factory method</em>, that is, a factory method which * depends on the enclosing instance. This pattern allows to easily specialise this class to iterators * that do different things, such as {@link it.unimi.dsi.mg4j.search.AndDocumentIterator} and * {@link it.unimi.dsi.mg4j.search.ConsecutiveDocumentIterator}, but that have a similar semantics at the document * level (the semantics may in fact be slightly different: for instance, not all document belonging * to all components will actually appear in a consecutive iterator, as there may be documents filtered * at the interval level). * * <P>The important invariant is that <em>only</em> after a call to {@link #nextDocument()}, a call * to {@link #intervalIterator(Index)} will return an interval iterator over the document * just returned, and that for at least one index in {@link #indices()} the iterator will not be empty * or {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE}. * * <h2>The intersection algorithm</h2> * * <p>Since MG4J 1.1, this class implements a new intersection algorithm that should be significantly * faster than the previous one. The main idea is that of letting sparser iterator interact as much * as possible to obtain a candidate common document, and then trying to align the others. At construction * time, the component iterators are sorted so that index iterators are separated, and sorted by frequency. * Then, each time we have to align the iterators we align them greedily starting from the index * iterators, in frequency order. This has the effect of skipping very quickly (and usually by * large jumps, which are handled nicely by indices with skips), * as the main interaction happens between low-frequency index iterators. * * <p>Moreover, this class treats in a special way * {@linkplain PayloadPredicateDocumentIterator index iterators coming from payload-based indices}. Such * iterators are checked at the end of the alignment process, * after all standard index iterators (and general document iterators) * are aligned. At that point, the special method {@link PayloadPredicateDocumentIterator#skipUnconditionallyTo(int)} * is used to position unconditionally such iterators and check whether the payload predicate is satisfied. * If this doesn't happen, the current candidate (obtained by alignment of standard iterators) is increased and the * whole process is restarted. This procedure guarantees that we will never search exhaustively in a * payload-based index a document record satisfying the predicate (unless, of course, we have a query * containing just {@link PayloadPredicateDocumentIterator}s), which is very efficient if the payload-based * index uses skipping. * */public abstract class AbstractIntersectionDocumentIterator extends AbstractCompositeDocumentIterator { private final static boolean DEBUG = false; private final static boolean ASSERTS = false; /** A map from indices to interval iterators. */ final protected Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators; /** A map from indices to the iterators returned for the current document. The key set may * not contain an index because the related iterator has never been requested. Moreover, * the iterator in this map for a given index may differ from the one in {@link #intervalIterators} * because it could be {@link IntervalIterators#TRUE TRUE} (in fact, in that case it may even * happen that {@link #intervalIterators} does not contain the index). */ final protected Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators; /** An unmodifiable wrapper around {@link #currentIterators}. */ final protected Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators; /** The provided document iterators, suitably sorted. */ final private DocumentIterator[] sortedIterator; /** The last element of {@link #sortedIterator}. */ final private DocumentIterator lastIterator; /** The prefix of {@link #sortedIterator} made of {@link PayloadPredicateDocumentIterator}s. */ final private PayloadPredicateDocumentIterator[] payloadPredicateDocumentIterator; /** Iterators in {@link #sortedIterator} up to this position (exclusive) are instances of {@link PayloadPredicateDocumentIterator}. */ final private int predicateStart; /** When true, the intersection list has been exhausted (of course, if {@link #next} is not -1 there is still an element to be returned). */ private boolean exhausted; /** The current element on which all iterators are aligned, unless {@link #exhausted} is true, in which case the value is undefined. */ private int curr; /** Creates a new intersection iterator using a given array of iterators and a given index. * @param index an index that will be passed to {@link AbstractCompositeDocumentIterator#AbstractCompositeDocumentIterator(Index, DocumentIterator...)}. * @param documentIterator the iterators to be insersected (at least one). */ protected AbstractIntersectionDocumentIterator( final Index index, final DocumentIterator[] documentIterator ) throws IOException { super( index, documentIterator ); if ( documentIterator.length == 0 ) throw new IllegalArgumentException(); sortedIterator = documentIterator.clone(); // We need a copy to reorder iterators intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() ); currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() ); unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators ); // We now reorder iterators putting in the back the index iterators of smallest frequency and moving to front payload-predicate iterators. Arrays.sort( sortedIterator, new Comparator<DocumentIterator>() { public int compare( final DocumentIterator d0, final DocumentIterator d1 ) { final PayloadPredicateDocumentIterator p0 = d0 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d0 : null; final PayloadPredicateDocumentIterator p1 = d1 instanceof PayloadPredicateDocumentIterator ? (PayloadPredicateDocumentIterator)d1 : null; if ( p0 != null && p1 != null ) return 0; if ( p0 != null ) return -1; if ( p1 != null ) return 1; final IndexIterator i0 = d0 instanceof IndexIterator ? (IndexIterator)d0 : null; final IndexIterator i1 = d1 instanceof IndexIterator ? (IndexIterator)d1 : null; if ( i0 == null && i1 == null ) return 0; if ( ( i0 != null ) != ( i1 != null ) ) return ( i0 != null ) ? 1 : -1; try { return i1.frequency() - i0.frequency(); } catch ( IOException e ) { throw new RuntimeException( e ); } } } ); lastIterator = sortedIterator[ n - 1 ]; int i; for( i = n; i-- != 0; ) if ( sortedIterator[ i ] instanceof PayloadPredicateDocumentIterator ) break; predicateStart = i + 1; payloadPredicateDocumentIterator = new PayloadPredicateDocumentIterator[ predicateStart ]; for( i = predicateStart; i-- != 0; ) payloadPredicateDocumentIterator[ i ] = (PayloadPredicateDocumentIterator)sortedIterator[ i ]; if ( DEBUG ) System.err.println( "Sorted iterators: " + Arrays.toString( sortedIterator ) ); /* We now advance all iterators to their first common pointer, if any. Note * that the difference between documentIterator and this.documentIterator * is immaterial here. */ for ( i = n; i-- != 0; ) if ( ! sortedIterator[ i ].hasNext() ) { // If any of the iterators is empty, we're over. exhausted = true; return; } if ( align() ) next = curr; else exhausted = true; } /** Creates a new intersection iterator using a given array of iterators. * @param documentIterator the iterators to be insersected (at least one). */ protected AbstractIntersectionDocumentIterator( final DocumentIterator[] documentIterator ) throws IOException { this( null, documentIterator ); } /** Align all iterators on the first common document pointer after {@link #curr}. * * <P>After a call to this method, all component iterators are positioned * on the same document (and {@link #curr} is set to that document), * unless the method returns false, in which case there * are no more document pointer making alignment possible (and {@link #exhausted} is true). * * @return true if all component iterators are aligned, false if * no alignment position could be found. */ private boolean align() throws IOException { if ( ASSERTS ) assert ! exhausted; if ( DEBUG ) System.err.println( this + ".align()..." ); int i, res; int candidate = curr; for(;;) { for( i = n; i-- != 0 ; ) { if ( i < predicateStart ) res = payloadPredicateDocumentIterator[ i ].skipUnconditionallyTo( candidate ); else res = sortedIterator[ i ].skipTo( candidate ); if ( res == Integer.MAX_VALUE ) return false; if ( res != candidate ) { // Note that for payload-predicate document iterators res might be negative if ( res > candidate ) candidate = res; else { if ( ASSERTS ) assert res < 0; candidate++; } break; } } if ( i == -1 ) { curr = candidate; return true; } } } public int skipTo( final int n ) throws IOException { if ( last >= n ) return last; last = next = -1; currentIterators.clear(); if ( curr < n ) { if ( ( curr = lastIterator.skipTo( n ) ) == Integer.MAX_VALUE ) { exhausted = true; return Integer.MAX_VALUE; } else exhausted = ! align(); } return exhausted ? Integer.MAX_VALUE : ( last = curr ); } public int nextDocument() throws IOException { if ( DEBUG ) System.err.println( this + ".hasNext()" ); if ( next >= 0 ) { // We already know what to return last = next; next = -1; return last; } last = next = -1; currentIterators.clear(); if ( ( curr = lastIterator.nextDocument() ) == -1 ) exhausted = true; else exhausted = ! align(); if ( exhausted ) return -1; return last = curr; } public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException { final Iterator<Index> i = indices.iterator(); while( i.hasNext() ) intervalIterator( i.next() ); return unmodifiableCurrentIterators; } public IntervalIterator intervalIterator( final Index index ) throws IOException { if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" ); if ( last == -1 ) throw new IllegalStateException(); if ( ! indices.contains( index ) ) return IntervalIterators.TRUE; IntervalIterator intervalIterator; // If the iterator has been created and it's ready, we just return it. if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator; int i, c; /* None of the iterators may be FALSE. Otherwise, if all iterators are TRUE, we return TRUE. Otherwise, we return * (possibly after creation) the underlying interval iterator. * * In the case of index iterators, we can avoid the check. No index iterator can return FALSE, and * at least one must return an iterator != TRUE (as indices.contains(index)). */ if ( indexIterator == null ) for( i = c = 0; i < n; i++ ) { intervalIterator = documentIterator[ i ].intervalIterator( index ); // We cannot be on a document one of whose iterators if FALSE. if ( intervalIterator == IntervalIterators.FALSE ) break; if ( intervalIterator != IntervalIterators.TRUE ) c++; } else i = c = n; // Note that we cannot optimise the case c == 1 because of gaps in ConsecutiveDocumentIterator. if ( i < n ) intervalIterator = IntervalIterators.FALSE; else if ( c == 0 ) intervalIterator = IntervalIterators.TRUE; else { intervalIterator = intervalIterators.get( index ); if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) ); intervalIterator.reset(); } currentIterators.put( index, intervalIterator ); return intervalIterator; } abstract protected IntervalIterator getComposedIntervalIterator( Index index );}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -