📄 bitstreamhpindexreader.c

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#if defined(CLASSNAME) && ( ( GENERIC && ! ( #frequencies || #pointers || #counts || #positions ) ) || ( ( ! GENERIC ) && #frequencies && #pointers && #counts && #positions ) )#if GENERICpackage it.unimi.dsi.mg4j.index;#elsepackage it.unimi.dsi.mg4j.index.wired;#endif/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;import it.unimi.dsi.fastutil.objects.ReferenceSet;import it.unimi.dsi.mg4j.index.AbstractIndexIterator;import it.unimi.dsi.mg4j.index.AbstractIndexReader;import it.unimi.dsi.mg4j.index.BitStreamHPIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.util.Interval;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.IntervalIterators;import it.unimi.dsi.bits.Fast;import it.unimi.dsi.Util;import java.io.IOException;import java.util.NoSuchElementException;import org.apache.log4j.Logger;#if GENERIC/** A bitstream-based {@linkplain IndexReader index reader} for {@linkplain BitStreamHPIndex high-performance indices}. */#elseimport it.unimi.dsi.mg4j.index.BitStreamIndex;#endifpublic class CLASSNAME extends AbstractIndexReader {	@SuppressWarnings("unused")	private static final Logger LOGGER = Util.getLogger( CLASSNAME.class );	private final static boolean ASSERTS = false;	private final static boolean DEBUG = false;	private final static boolean COOKIES = false;	/** The reference index. */	protected final BitStreamHPIndex index;	/** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */	protected final BitStreamHPIndexReaderIndexIterator indexIterator;	/**	 * Creates a new skip index reader, with the specified underlying {@link Index} and input bit	 * stream.	 * 	 * @param index the index.	 * @param ibs the underlying bit stream.	 */	public CLASSNAME( final BitStreamHPIndex index, final InputBitStream ibs, final InputBitStream positions ) {		this.index = index;		this.indexIterator = new BitStreamHPIndexReaderIndexIterator( this, ibs, positions );	}	protected static final class BitStreamHPIndexReaderIndexIterator extends AbstractIndexIterator implements IndexIterator {		/** The enclosing instance. */		private final CLASSNAME parent;		/** The reference index. */		protected final BitStreamHPIndex index;		/** The underlying input bit stream. */		protected final InputBitStream ibs;		/** The underlying positions input bit stream. */		private final InputBitStream positions;		/** The enclosed interval iterator. */		private final IndexIntervalIterator intervalIterator;		/** A singleton set containing the enclosed interval iterator. */		private final Reference2ReferenceMap<Index, IntervalIterator> singletonIntervalIterator;		/** The key index. */		private final Index keyIndex;		/** The cached copy of {@link #index index.pointerCoding}. */		protected final Coding pointerCoding;		/** The cached copy of {@link #index index.countCoding}. */		protected final Coding countCoding;		/** The cached copy of {@link #index index.positionCoding}. */		protected final Coding positionCoding;#if GENERIC || #pointers(GOLOMB)		/** The parameter <code>b</code> for Golomb coding of pointers. */		protected int b;		/**		 * The parameter <code>log2b</code> for Golomb coding of pointers; it is the most		 * significant bit of {@link #b}.		 */		protected int log2b;#endif		/** The current term. */		@SuppressWarnings("hiding")		protected int term = -1;		/** The current frequency. */		protected int frequency;		/**		 * Whether the current terms has pointers at all (this happens when the {@link #frequency}		 * is smaller than the number of documents).		 */		protected boolean hasPointers;		/** The current count (if this index contains counts). */		protected int count;		/**		 * The last document pointer we read from current list, -1 if we just read the frequency,		 * {@link Integer#MAX_VALUE} if we are beyond the end of list.		 */		protected int currentDocument;		/** The number of the document record we are going to read inside the current inverted list. */		protected int numberOfDocumentRecord;		/** This variable tracks the current state of the reader. */		protected int state;		/** The parameter <code>h</code> (the maximum height of a skip tower). */		public final int height;		/** The quantum. */		public final int quantum;		/** The bit mask giving the remainder of the division by {@link #quantum}. */		public final int quantumModuloMask;		/** The shift giving result of the division by {@link #quantum}. */		public final int quantumDivisionShift;		/**		 * The maximum height of a skip tower in the current block. May be less than {@link #height}		 * if the block is defective, and will be -1 on defective quanta (no tower at all).		 */		private int maxh;		/** The maximum valid index of the current skip tower, if any. */		private int s;		/**		 * The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If		 * {@link #maxh} is negative, the value is undefined.		 */		private int lowest;		/** We have <var>w</var> = <var>Hq</var>. */		private int w;		/** The bit mask giving the remainder of the division by {@link #w}. */		private final int wModuloMask;		/** The shift giving result of the division by {@link #w}. */		private final int wDivisionShift;		/** The Golomb modulus for a top pointer skip, for each level. */		private int[] towerTopB;		/** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */		private int[] towerTopLog2B;		/** The Golomb modulus for a lower pointer skip, for each level. */		private int[] towerLowerB;		/** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */		private int[] towerLowerLog2B;		/** The prediction for a pointer skip, for each level. */		private int[] pointerPrediction;		/** An array to decode bit skips. */		private long[] bitSkip;		/** An array to decode positions bit skips. */		private long[] positionsBitSkip;		/** An array to decode the pointer skips. */		private int[] pointerSkip;		/** The number of bits read just after reading the last skip tower. */		private long readBitsAtLastSkipTower;		/** The document pointer corresponding to the last skip tower. */		private int pointerAtLastSkipTower;		/** The number of positions that should be skipped manually		 * once the last read tower has been reached by skipping {@link #positionsToReadToReachCurrentPosition}. 		 */		private int positionsToReadToReachCurrentPosition;				/** The offset in the positions stream from the start of the current quantum		 * to the start of the last tower.		 */		private long positionsBitsOffset;				/** The last increment added to {@link #positionsBitSkip}. */		private long lastPositionsIncrement;				/** The current quantum bit length, as provided by the index. */		private int quantumBitLength;		/** The current positions quantum bit length, as provided by the index. */		private int positionsQuantumBitLength;		/** The current entry bit length, as provided by the index. */		private int entryBitLength;		/** This value of {@link #state} means that we are positioned just before a tower. */		private static final int BEFORE_TOWER = 0;		/**		 * This value of {@link #state} can be assumed only in indices that contain counts; it means		 * that we are positioned just before the count for the current document record.		 */		private static final int BEFORE_COUNT = 1;		/**		 * This value of {@link #state} means that we are at the start of a new document record,		 * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} ==		 * {@link #frequency}), in which case we are at the end of the inverted list, and		 * {@link #endOfList()} is true.		 */		private static final int BEFORE_POINTER = 2;		/** Whether the positions for the current document pointer have not been fetched yet. */		private boolean positionsUnread;				/** The cached position array. */		protected int[] positionCache = new int[ 16 ];		/** The offset of the positions of the current {@link #term}. */		protected long lastPositionsOffset;		public BitStreamHPIndexReaderIndexIterator( final CLASSNAME parent, final InputBitStream ibs, final InputBitStream positions ) {			this.parent = parent;			this.ibs = ibs;			this.positions = positions;			index = parent.index;			keyIndex = index.keyIndex;			pointerCoding = index.pointerCoding;			countCoding = index.countCoding;			positionCoding = index.positionCoding;			intervalIterator = index.hasPositions ? new IndexIntervalIterator() : null;			singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps.singleton( keyIndex, (IntervalIterator)intervalIterator ) : null;			quantum = index.quantum;			quantumModuloMask = quantum - 1;			quantumDivisionShift = Fast.mostSignificantBit( quantum );			height = index.height;			w = ( 1 << height ) * quantum;			wModuloMask = w - 1;			wDivisionShift = Fast.mostSignificantBit( w );			bitSkip = new long[ height + 1 ];			positionsBitSkip = new long[ height + 1 ];			pointerSkip = new int[ height + 1 ];			towerTopB = new int[ height + 1 ];			towerTopLog2B = new int[ height + 1 ];			towerLowerB = new int[ height + 1 ];			towerLowerLog2B = new int[ height + 1 ];			pointerPrediction = new int[ height + 1 ];		}		/** Debug variable, usually unused, that keeps track of the end of the positions stream for the current term (but not for term 0!). */		private long positionsLength;		/**		 * Positions the index on the inverted list of a given term.		 * 		 * <p>This method can be called at any time. Note that it is <em>always</em> possible to		 * call this method with argument 0, even if offsets have not been loaded.		 * 		 * @param term a term.		 */		protected void position( final int term ) throws IOException {			if ( term == 0 ) {				if ( ASSERTS ) {					// Get end of positions					ibs.position( index.offsets.getLong( 1 ) );					positionsLength = ibs.readLongDelta();				}				ibs.position( 0 );				ibs.readBits( 0 );				lastPositionsOffset = ibs.readLongDelta(); // This is 0 for sure 				positions.position( 0 );			}			else {				if ( index.offsets == null ) throw new IllegalStateException( "You cannot position an index without offsets" );				if ( ASSERTS ) {					// Get end of positions					if ( term < index.numberOfTerms - 1 ) {						ibs.position( index.offsets.getLong( term + 1 ) );						positionsLength = ibs.readLongDelta();					}					else positionsLength = Long.MAX_VALUE; // Presently, no way to check				}				ibs.position( index.offsets.getLong( term ) );				ibs.readBits( index.offsets.getLong( term ) );				// Let us position the positions bitstream				lastPositionsOffset = ibs.readLongDelta(); 				positions.position( lastPositionsOffset );				if ( ASSERTS && positionsLength != Long.MAX_VALUE ) positionsLength -= lastPositionsOffset;				if ( DEBUG && ASSERTS ) System.err.println( this + ": positions for term " + term + " start at bit " + lastPositionsOffset + " (" + positionsLength + " bits)" );			}			positions.readBits( 0 );			this.term = term;			readFrequency();		}		public int termNumber() {			return term;		}				protected IndexIterator advance() throws IOException {			if ( term == index.numberOfTerms - 1 ) return null;			if ( term != -1 ) {				skipTo( Integer.MAX_VALUE );				// This guarantees we have no garbage before the frequency				nextDocument();				positionsUnread = false;			}			// We must skip the offset into the positions bitstream			final long nextPositionsOffset = ibs.readLongDelta();			positions.skip( nextPositionsOffset - lastPositionsOffset - positions.readBits() );			lastPositionsOffset = nextPositionsOffset;			positions.readBits( 0 );			term++;			if ( ASSERTS ) positionsLength = -1; // Invalidate			readFrequency();			return this;		}
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -