📄 documentiterator.java
字号:
package it.unimi.dsi.mg4j.search;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.ReferenceSet;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;import it.unimi.dsi.util.Interval;import java.io.IOException;import java.util.Map;import java.util.NoSuchElementException;/** An iterator over documents (pointers) and their intervals. * * <p><strong>Warning:</strong> the semantics of {@link #nextDocument()} has changed significantly * in MG4J 1.2. * * <p><strong>Warning</strong>: from MG4J 1.2, most methods throw an {@link IOException} * (such exceptions used to be catched and wrapped into a {@link RuntimeException}). * * <p><strong>Warning:</strong> the semantics of {@link #skipTo(int)} has changed significantly * in MG4J 1.1. * * <P>Each call to {@link #nextDocument()} * will return a document pointer, or -1 if no more documents are available. Just * after the call to {@link #nextDocument()}, {@link #intervalIterator(Index)} will return an interval iterator * enumerating intervals in the last returned document for the specified index. The latter method may return, as a special result, a * special {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE} value: this means that * albeit the current document satisfies the query, there is only a generic * empty witness to prove it (see {@link it.unimi.dsi.mg4j.search.IntervalIterators#TRUE TRUE} for some elaboration). * * <p>Note that this class implements {@link IntIterator}. Nonetheless, for performance reasons, * the preferred access to the document pointers is {@link #nextDocument()}. * * <P>The {@link #iterator()} method <strong>must</strong> be an alias for {@link #intervalIterator()}, and shares * the same limitations. * * <p>A document iterator is usually structured as composite, * with operators as internal nodes and {@link it.unimi.dsi.mg4j.index.IndexIterator}s * as leaves. The methods {@link #accept(DocumentIteratorVisitor)} * and {@link #acceptOnTruePaths(DocumentIteratorVisitor)} implement the visitor pattern. * * <p>The {@link #dispose()} method is intended to recursively release all resources associated * to a composite document iterator. Note that this is not always what you want, as you might * be, say, pooling {@linkplain it.unimi.dsi.mg4j.index.IndexReader index readers} to reduce the number * of file open/close operations. For this reason, we intentionally avoid calling the method “close”. * * <p><strong>Warning:</strong> the interval enumeration can be carried out only just after a call * to {@link #nextDocument()}. Subsequent calls to {@link #nextDocument()} <em>or even to {@link java.util.Iterator#hasNext()}</em> * will reset the internal state of the iterator. In particular, trying to enumerate intervals after a call * to {@link java.util.Iterator#hasNext()} will usually throw an {@link java.lang.IllegalStateException}. */public interface DocumentIterator extends IntIterator, Iterable<Interval> { /** Returns the interval iterator of this document iterator for single-index queries. * * <P>This is a commodity method that can be used only for queries * built over a single index. * * @return an interval iterator. * @see #intervalIterator(Index) * @throws IllegalStateException if this document iterator is not built on a single index. */ public IntervalIterator intervalIterator() throws IOException; /** Returns the interval iterator of this document iterator for the given index. * * <P>After a call to {@link #nextDocument()}, this iterator * can be used to retrieve the intervals in the current document (the * one returned by {@link #nextDocument()}) for * the index <code>index</code>. * * <P>Note that if all indices have positions, * it is guaranteed that at least one index will return an interval. * However, for disjunctive queries it cannot be guaranteed that <em>all</em> * indices will return an interval. * * <p>Indices without positions always return {@link IntervalIterators#TRUE}. * Thus, in presence of indices without positions it is possible that no * intervals at all are available. * * @param index an index (must be one over which the query was built). * @return an interval iterator over the current document in <code>index</code>. */ public IntervalIterator intervalIterator( Index index ) throws IOException; /** Returns an unmodifiable map from indices to interval iterators. * * <P>After a call to {@link #nextDocument()}, this map * can be used to retrieve the intervals in the current document. An invocation of {@link Map#get(java.lang.Object)} * on this map with argument <code>index</code> yields the same result as * {@link #intervalIterator(Index) intervalIterator(index)}. * * @return a map from indices to interval iterators over the current document. * @throws UnsupportedOperationException if this index does not contain positions. * @see #intervalIterator(Index) */ public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException; /** Returns the set of indices over which this iterator is built. * * @return the set of indices over which this iterator is built. */ public ReferenceSet<Index> indices(); /** Returns the next document. * * @deprecated As of MG4J 1.2, the suggested way of iterating over document iterators * is {@link #nextDocument()}, which has been modified so to provide fully lazy * iteration. After a couple of releases, however, this annotation will be removed, as it * is very practical to have document iterators implementing {@link IntIterator}. Its * main purpose is to warn people about performance issues solved by {@link #nextDocument()}. * @see #nextDocument() */ @Deprecated public int nextInt(); /** Returns the next document provided by this document iterator, or -1 if no more documents are available. * * <p><strong>Warning</strong>: the specification of this method has significantly changed as of MG4J 1.2. * The special return value -1 is used to mark the end of iteration (a {@link NoSuchElementException} * would have been thrown before in that case, so ho harm should be caused by this change). The reason * for this change is providing <em>fully lazy</em> iteration over documents. Fully lazy iteration * does not provide an <code>hasNext()</code> method—you have to actually ask for the next * element and check the return value. Fully lazy iteration is much lighter on method calls (half) and * in most (if not all) MG4J classes leads to a much simpler logic. Moreover, {@link #nextDocument()} * can be specified as throwing an {@link IOException}, which avoids the pernicious proliferation * of try/catch blocks in very short, low-level methods (it was having a detectable impact on performance). * * @return the next document, or -1 if no more documents are available. */ public int nextDocument() throws IOException; /** Returns the last document returned by {@link #nextDocument()}. * * @return the last document returned by {@link #nextDocument()}, or -1 if no document has been returned yet. */ public int document(); /** Skips all documents smaller than <code>n</code>. * * <P>Define the <em>current document</em> <code>k</code> associated with this document iterator * as follows: * <ul> * <li>-1, if {@link #nextDocument()} and this method have never been called; * <li>{@link Integer#MAX_VALUE}, if a call to this method returned {@link Integer#MAX_VALUE}; * <li>the last value returned by a call to {@link #nextDocument()} or this method, otherwise. * </ul> * * <p>If <code>k</code> is larger than or equal to <code>n</code>, then * this method does nothing and returns <code>k</code>. Otherwise, a * call to this method is equivalent to * <pre> * while( ( k = nextDocument() ) < n && k != -1 ); * return k == -1 ? Integer.MAX_VALUE : k; * </pre> * * <P>Thus, when a result <code>k</code> ≠ {@link Integer#MAX_VALUE} * is returned, the state of this iterator * will be exactly the same as after a call to {@link #nextDocument()} * that returned <code>k</code>. * In particular, the first document larger than or equal to <code>n</code> (when returned * by this method) will <em>not</em> be returned by the next call to * {@link #nextDocument()}. * * @param n a document pointer. * @return a document pointer larger than or equal to <code>n</code> if available, {@link Integer#MAX_VALUE} * otherwise. */ int skipTo( int n ) throws IOException; /** Accepts a visitor. * * <p>A document iterator is usually structured as composite, * with operators as internal nodes and {@link it.unimi.dsi.mg4j.index.IndexIterator}s * as leaves. This method implements the visitor pattern. * * @param visitor the visitor. * @return true if the visit should continue. */ boolean accept( DocumentIteratorVisitor visitor ) throws IOException; /** Accepts a visitor after a call to {@link #nextDocument()}, * limiting recursion to true paths. * * <p>After a call to {@link #nextDocument()}, a document iterator * is positioned over a document. This call is equivalent to {@link #accept(DocumentIteratorVisitor)}, * but visits only along <em>true paths</em>. * * <p>We define a <em>true path</em> as a path from the root of the composite that passes only through * nodes whose associated subtree is positioned on the same document of the root. Note that {@link OrDocumentIterator}s * detach exhausted iterators from the composite tree, so true paths define the subtree that is causing * the current document to satisfy the query represented by this document iterator. * * <p>For more elaboration, and the main application of this method, see {@link it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor}. * * @param visitor the visitor. * @return true if the visit should continue. * @see #accept(DocumentIteratorVisitor) * @see it.unimi.dsi.mg4j.search.visitor.CounterCollectionVisitor */ boolean acceptOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException; /** Disposes this document iterator, releasing all resources. * * <p>This method should propagate down to the underlying index iterators, where it should release resources * such as open files and network connections. If you're doing your own resource tracking and pooling, * then you do not need to call this method. */ void dispose() throws IOException; /** An alias for {@link #intervalIterator()}, that has the same limitations (i.e., it will work only if * there is just one index), and that catches {@link IOException}s. * * @return an interval iterator. */ IntervalIterator iterator(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -