📄 bitstreamhpindexreader.c
字号:
#if defined(CLASSNAME) && ( ( GENERIC && ! ( #frequencies || #pointers || #counts || #positions ) ) || ( ( ! GENERIC ) && #frequencies && #pointers && #counts && #positions ) )#if GENERICpackage it.unimi.dsi.mg4j.index;#elsepackage it.unimi.dsi.mg4j.index.wired;#endif/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;import it.unimi.dsi.fastutil.objects.ReferenceSet;import it.unimi.dsi.mg4j.index.AbstractIndexIterator;import it.unimi.dsi.mg4j.index.AbstractIndexReader;import it.unimi.dsi.mg4j.index.BitStreamHPIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.util.Interval;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.IntervalIterators;import it.unimi.dsi.bits.Fast;import it.unimi.dsi.Util;import java.io.IOException;import java.util.NoSuchElementException;import org.apache.log4j.Logger;#if GENERIC/** A bitstream-based {@linkplain IndexReader index reader} for {@linkplain BitStreamHPIndex high-performance indices}. */#elseimport it.unimi.dsi.mg4j.index.BitStreamIndex;#endifpublic class CLASSNAME extends AbstractIndexReader { @SuppressWarnings("unused") private static final Logger LOGGER = Util.getLogger( CLASSNAME.class ); private final static boolean ASSERTS = false; private final static boolean DEBUG = false; private final static boolean COOKIES = false; /** The reference index. */ protected final BitStreamHPIndex index; /** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */ protected final BitStreamHPIndexReaderIndexIterator indexIterator; /** * Creates a new skip index reader, with the specified underlying {@link Index} and input bit * stream. * * @param index the index. * @param ibs the underlying bit stream. */ public CLASSNAME( final BitStreamHPIndex index, final InputBitStream ibs, final InputBitStream positions ) { this.index = index; this.indexIterator = new BitStreamHPIndexReaderIndexIterator( this, ibs, positions ); } protected static final class BitStreamHPIndexReaderIndexIterator extends AbstractIndexIterator implements IndexIterator { /** The enclosing instance. */ private final CLASSNAME parent; /** The reference index. */ protected final BitStreamHPIndex index; /** The underlying input bit stream. */ protected final InputBitStream ibs; /** The underlying positions input bit stream. */ private final InputBitStream positions; /** The enclosed interval iterator. */ private final IndexIntervalIterator intervalIterator; /** A singleton set containing the enclosed interval iterator. */ private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator; /** The key index. */ private final Index keyIndex; /** The cached copy of {@link #index index.pointerCoding}. */ protected final Coding pointerCoding; /** The cached copy of {@link #index index.countCoding}. */ protected final Coding countCoding; /** The cached copy of {@link #index index.positionCoding}. */ protected final Coding positionCoding;#if GENERIC || #pointers(GOLOMB) /** The parameter <code>b</code> for Golomb coding of pointers. */ protected int b; /** * The parameter <code>log2b</code> for Golomb coding of pointers; it is the most * significant bit of {@link #b}. */ protected int log2b;#endif /** The current term. */ @SuppressWarnings("hiding") protected int term = -1; /** The current frequency. */ protected int frequency; /** * Whether the current terms has pointers at all (this happens when the {@link #frequency} * is smaller than the number of documents). */ protected boolean hasPointers; /** The current count (if this index contains counts). */ protected int count; /** * The last document pointer we read from current list, -1 if we just read the frequency, * {@link Integer#MAX_VALUE} if we are beyond the end of list. */ protected int currentDocument; /** The number of the document record we are going to read inside the current inverted list. */ protected int numberOfDocumentRecord; /** This variable tracks the current state of the reader. */ protected int state; /** The parameter <code>h</code> (the maximum height of a skip tower). */ public final int height; /** The quantum. */ public final int quantum; /** The bit mask giving the remainder of the division by {@link #quantum}. */ public final int quantumModuloMask; /** The shift giving result of the division by {@link #quantum}. */ public final int quantumDivisionShift; /** * The maximum height of a skip tower in the current block. May be less than {@link #height} * if the block is defective, and will be -1 on defective quanta (no tower at all). */ private int maxh; /** The maximum valid index of the current skip tower, if any. */ private int s; /** * The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If * {@link #maxh} is negative, the value is undefined. */ private int lowest; /** We have <var>w</var> = <var>Hq</var>. */ private int w; /** The bit mask giving the remainder of the division by {@link #w}. */ private final int wModuloMask; /** The shift giving result of the division by {@link #w}. */ private final int wDivisionShift; /** The Golomb modulus for a top pointer skip, for each level. */ private int[] towerTopB; /** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */ private int[] towerTopLog2B; /** The Golomb modulus for a lower pointer skip, for each level. */ private int[] towerLowerB; /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */ private int[] towerLowerLog2B; /** The prediction for a pointer skip, for each level. */ private int[] pointerPrediction; /** An array to decode bit skips. */ private long[] bitSkip; /** An array to decode positions bit skips. */ private long[] positionsBitSkip; /** An array to decode the pointer skips. */ private int[] pointerSkip; /** The number of bits read just after reading the last skip tower. */ private long readBitsAtLastSkipTower; /** The document pointer corresponding to the last skip tower. */ private int pointerAtLastSkipTower; /** The number of positions that should be skipped manually * once the last read tower has been reached by skipping {@link #positionsToReadToReachCurrentPosition}. */ private int positionsToReadToReachCurrentPosition; /** The offset in the positions stream from the start of the current quantum * to the start of the last tower. */ private long positionsBitsOffset; /** The last increment added to {@link #positionsBitSkip}. */ private long lastPositionsIncrement; /** The current quantum bit length, as provided by the index. */ private int quantumBitLength; /** The current positions quantum bit length, as provided by the index. */ private int positionsQuantumBitLength; /** The current entry bit length, as provided by the index. */ private int entryBitLength; /** This value of {@link #state} means that we are positioned just before a tower. */ private static final int BEFORE_TOWER = 0; /** * This value of {@link #state} can be assumed only in indices that contain counts; it means * that we are positioned just before the count for the current document record. */ private static final int BEFORE_COUNT = 1; /** * This value of {@link #state} means that we are at the start of a new document record, * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == * {@link #frequency}), in which case we are at the end of the inverted list, and * {@link #endOfList()} is true. */ private static final int BEFORE_POINTER = 2; /** Whether the positions for the current document pointer have not been fetched yet. */ private boolean positionsUnread; /** The cached position array. */ protected int[] positionCache = new int[ 16 ]; /** The offset of the positions of the current {@link #term}. */ protected long lastPositionsOffset; public BitStreamHPIndexReaderIndexIterator( final CLASSNAME parent, final InputBitStream ibs, final InputBitStream positions ) { this.parent = parent; this.ibs = ibs; this.positions = positions; index = parent.index; keyIndex = index.keyIndex; pointerCoding = index.pointerCoding; countCoding = index.countCoding; positionCoding = index.positionCoding; intervalIterator = index.hasPositions ? new IndexIntervalIterator() : null; singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps.singleton( keyIndex, (IntervalIterator)intervalIterator ) : null; quantum = index.quantum; quantumModuloMask = quantum - 1; quantumDivisionShift = Fast.mostSignificantBit( quantum ); height = index.height; w = ( 1 << height ) * quantum; wModuloMask = w - 1; wDivisionShift = Fast.mostSignificantBit( w ); bitSkip = new long[ height + 1 ]; positionsBitSkip = new long[ height + 1 ]; pointerSkip = new int[ height + 1 ]; towerTopB = new int[ height + 1 ]; towerTopLog2B = new int[ height + 1 ]; towerLowerB = new int[ height + 1 ]; towerLowerLog2B = new int[ height + 1 ]; pointerPrediction = new int[ height + 1 ]; } /** Debug variable, usually unused, that keeps track of the end of the positions stream for the current term (but not for term 0!). */ private long positionsLength; /** * Positions the index on the inverted list of a given term. * * <p>This method can be called at any time. Note that it is <em>always</em> possible to * call this method with argument 0, even if offsets have not been loaded. * * @param term a term. */ protected void position( final int term ) throws IOException { if ( term == 0 ) { if ( ASSERTS ) { // Get end of positions ibs.position( index.offsets.getLong( 1 ) ); positionsLength = ibs.readLongDelta(); } ibs.position( 0 ); ibs.readBits( 0 ); lastPositionsOffset = ibs.readLongDelta(); // This is 0 for sure positions.position( 0 ); } else { if ( index.offsets == null ) throw new IllegalStateException( "You cannot position an index without offsets" ); if ( ASSERTS ) { // Get end of positions if ( term < index.numberOfTerms - 1 ) { ibs.position( index.offsets.getLong( term + 1 ) ); positionsLength = ibs.readLongDelta(); } else positionsLength = Long.MAX_VALUE; // Presently, no way to check } ibs.position( index.offsets.getLong( term ) ); ibs.readBits( index.offsets.getLong( term ) ); // Let us position the positions bitstream lastPositionsOffset = ibs.readLongDelta(); positions.position( lastPositionsOffset ); if ( ASSERTS && positionsLength != Long.MAX_VALUE ) positionsLength -= lastPositionsOffset; if ( DEBUG && ASSERTS ) System.err.println( this + ": positions for term " + term + " start at bit " + lastPositionsOffset + " (" + positionsLength + " bits)" ); } positions.readBits( 0 ); this.term = term; readFrequency(); } public int termNumber() { return term; } protected IndexIterator advance() throws IOException { if ( term == index.numberOfTerms - 1 ) return null; if ( term != -1 ) { skipTo( Integer.MAX_VALUE ); // This guarantees we have no garbage before the frequency nextDocument(); positionsUnread = false; } // We must skip the offset into the positions bitstream final long nextPositionsOffset = ibs.readLongDelta(); positions.skip( nextPositionsOffset - lastPositionsOffset - positions.readBits() ); lastPositionsOffset = nextPositionsOffset; positions.readBits( 0 ); term++; if ( ASSERTS ) positionsLength = -1; // Invalidate readFrequency(); return this; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -