📄 bitstreamindexreader.c

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 C
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
#if defined(CLASSNAME) && ( ( GENERIC && ! ( SKIPS || PAYLOADS || #frequencies || #pointers || #counts || #positions ) ) || ( ( ! GENERIC ) && #frequencies && #pointers && #counts && #positions(NONE) ) || ( ( ! GENERIC ) && #frequencies && #pointers && #counts && ( ! #counts(NONE) ) && #positions ) )#if GENERICpackage it.unimi.dsi.mg4j.index;#elsepackage it.unimi.dsi.mg4j.index.wired;#endif/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2006 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntSet;import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;import it.unimi.dsi.fastutil.objects.ReferenceSet;import it.unimi.dsi.mg4j.index.AbstractIndexIterator;import it.unimi.dsi.mg4j.index.AbstractIndexReader;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.util.Interval;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.IntervalIterators;import it.unimi.dsi.bits.Fast;import it.unimi.dsi.Util;import java.io.IOException;import java.util.NoSuchElementException;import org.apache.log4j.Logger;#if GENERIC/** A bitstream-based {@linkplain IndexReader index reader}. */#endifpublic class CLASSNAME extends AbstractIndexReader {	@SuppressWarnings("unused")	private static final Logger LOGGER = Util.getLogger( CLASSNAME.class );	/** The reference index. */	protected final BitStreamIndex index;	private final static boolean ASSERTS = false;	private final static boolean DEBUG = false;	/** The {@link IndexIterator} view of this reader (returned by {@link #documents(CharSequence)}). */	protected final BitStreamIndexReaderIndexIterator indexIterator;		/** Creates a new skip index reader, with the specified underlying {@link Index} and input bit stream.	 *	 * @param index the index.	 * @param ibs the underlying bit stream.	 */	public CLASSNAME( final BitStreamIndex index, final InputBitStream ibs ) {		this.index = index;		this.indexIterator = new BitStreamIndexReaderIndexIterator( this, ibs );	}		protected static final class BitStreamIndexReaderIndexIterator extends AbstractIndexIterator implements IndexIterator {		/** The enclosing instance. */		private final CLASSNAME parent;		/** The reference index. */		protected final BitStreamIndex index;		/** The underlying input bit stream. */		protected final InputBitStream ibs;		/** The enclosed interval iterator. */		private final IndexIntervalIterator intervalIterator;		/** A singleton set containing the enclosed interval iterator. */		private final Reference2ReferenceMap<Index,IntervalIterator> singletonIntervalIterator;		/** The key index. */		private final Index keyIndex;#if GENERIC		/** The cached copy of {@link #index index.hasPositions}. */		protected final boolean hasPositions;		/** The cached copy of {@link #index index.hasCounts}. */		protected final boolean hasCounts;		/** The cached copy of {@link #index index.hasPayloads}. */		protected final boolean hasPayloads;		/** Whether the underlying index has skips. */		protected final boolean hasSkips;#endif		/** The cached copy of {@link #index index.pointerCoding}. */		protected final Coding pointerCoding;#if GENERIC || ! #counts(NONE)		/** The cached copy of {@link #index index.countCoding}. */		protected final Coding countCoding;#endif#if GENERIC || ! #positions(NONE)		/** The cached copy of {@link #index index.positionCoding}. */		protected final Coding positionCoding;#endif#if GENERIC || PAYLOADS		/** The payload, in case the index of this reader has payloads, or <code>null</code>. */		protected final Payload payload;#endif#if GENERIC || #pointers(GOLOMB)		/** The parameter <code>b</code> for Golomb coding of pointers. */		protected int b;		/** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */		protected int log2b;#endif		/** The current term. */		protected int currentTerm = -1;		/** The current frequency. */		protected int frequency;		/** Whether the current terms has pointers at all (this happens when the {@link #frequency} is smaller than the number of documents). */		protected boolean hasPointers;		/** The current count (if this index contains counts). */		protected int count;		/** The last document pointer we read from current list, -1 if we just read the frequency,		 * {@link Integer#MAX_VALUE} if we are beyond the end of list. */		protected int currentDocument;		/** The number of the document record we are going to read inside the current inverted list. */		protected int numberOfDocumentRecord;		/** This variable tracks the current state of the reader. */		protected int state;#if GENERIC || SKIPS		/** The parameter <code>h</code> (the maximum height of a skip tower). */		public final int height;		/** The quantum. */		public final int quantum;		/** The bit mask giving the remainder of the division by {@link #quantum}. */		public final int quantumModuloMask;		/** The shift giving result of the division by {@link #quantum}. */		public final int quantumDivisionShift;		/** The maximum height of a skip tower in the current block. May be less than {@link #height} if the block is defective,		 * and will be -1 on defective quanta (no tower at all). */		private int maxh;		/** The maximum valid index of the current skip tower, if any. */		private int s;		/** The minimum valid index of the current skip tower, or {@link Integer#MAX_VALUE}. If {@link #maxh} is negative, the value is undefined. */		private int lowest;		/** We have <var>w</var> = <var>Hq</var>. */		private final int w;		/** The bit mask giving the remainder of the division by {@link #w}. */		private final int wModuloMask;		/** The shift giving result of the division by {@link #w}. */		private final int wDivisionShift;		/** The Golomb modulus for a top pointer skip, for each level. */		private int[] towerTopB;		/** The most significant bit of the Golomb modulus for a top point[]er skip, for each level. */		private int[] towerTopLog2B;		/** The Golomb modulus for a lower pointer skip, for each level. */		private int[] towerLowerB;		/** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */		private int[] towerLowerLog2B;		/** The prediction for a pointer skip, for each level. */		private int[] pointerPrediction;		/** An array to decode bit skips. */		private long[] bitSkip;		/** An array to decode the pointer skips. */		private int[] pointerSkip;		/** The number of bits read just after reading the last skip tower. */		private long readBitsAtLastSkipTower;		/** The document pointer corresponding to the last skip tower. */		private int pointerAtLastSkipTower;		/** The current quantum bit length, as provided by the index. */		private int quantumBitLength;		/** The current entry bit length, as provided by the index. */		private int entryBitLength;		/** This value of {@link #state} means that we are positioned just before a tower. */		private static final int BEFORE_TOWER = 0;#endif		/** This value of {@link #state} can be assumed only in indices that contain a payload; it		 * means that we are positioned just before the payload for the current document record. */		private static final int BEFORE_PAYLOAD = 1;		/** This value of {@link #state} can be assumed only in indices that contain counts; it		 * means that we are positioned just before the count for the current document record. */		private static final int BEFORE_COUNT = 2;		/** This value of {@link #state} can be assumed only in indices that contain document positions; 		 * it means that we are positioned just before the position list of the current document record. */		private static final int BEFORE_POSITIONS = 3;		/** This value of {@link #state} means that we are at the start of a new document record, 		 * unless we already read all documents (i.e., {@link #numberOfDocumentRecord} == {@link #frequency}),		 * in which case we are at the end of the inverted list, and {@link #endOfList()} is true. */		private static final int BEFORE_POINTER = 4;		/** The cached position array. */		protected int[] positionCache = new int[ 16 ];				public BitStreamIndexReaderIndexIterator( final CLASSNAME parent, final InputBitStream ibs ) {			this.parent = parent;			this.ibs = ibs;			index = parent.index;			keyIndex = index.keyIndex;			pointerCoding = index.pointerCoding;#if GENERIC			hasPayloads = index.hasPayloads;			payload = hasPayloads ? index.payload.copy() : null;#elif PAYLOADS			if ( ! index.hasPayloads ) throw new IllegalStateException();			payload = index.payload.copy();#else			if ( index.hasPayloads ) throw new IllegalStateException();#endif#if GENERIC			hasCounts = index.hasCounts;			countCoding = index.countCoding;#elif #counts(NONE)			if ( index.hasCounts ) throw new IllegalStateException();#else			if ( ! index.hasCounts ) throw new IllegalStateException();			countCoding = index.countCoding;#endif#if GENERIC			hasPositions = index.hasPositions;			positionCoding = index.positionCoding;#elif #positions(NONE)			if ( index.hasPositions ) throw new IllegalStateException();#else			if ( ! index.hasPositions ) throw new IllegalStateException();			positionCoding = index.positionCoding;#endif			intervalIterator = index.hasPositions ? new IndexIntervalIterator() : null;			singletonIntervalIterator = index.hasPositions ? Reference2ReferenceMaps.singleton( keyIndex, (IntervalIterator)intervalIterator ) : null;#if GENERIC || SKIPS			quantum = index.quantum;			height = index.height;			if ( ( quantum == -1 ) != ( height == -1 ) ) throw new IllegalArgumentException();#endif#if GENERIC			hasSkips = quantum != -1 && height != -1;			if ( hasSkips ) {#endif#if GENERIC || SKIPS				w = ( 1 << height ) * quantum;				quantumModuloMask = quantum - 1;				wModuloMask = w - 1;						quantumDivisionShift = Fast.mostSignificantBit( quantum );				wDivisionShift = Fast.mostSignificantBit( w );				bitSkip = new long[ height + 1 ];				pointerSkip = new int[ height + 1 ];				towerTopB = new int[ height + 1 ];				towerTopLog2B = new int[ height  + 1 ];				towerLowerB = new int[ height + 1 ];				towerLowerLog2B = new int[ height  + 1 ];				pointerPrediction = new int[ height  + 1 ];#endif#if GENERIC			}			else {				w = quantumModuloMask = wModuloMask = quantumDivisionShift = wDivisionShift = 0;				bitSkip = null;				pointerSkip = towerTopB = towerTopLog2B = towerLowerB = towerLowerLog2B = pointerPrediction = null;			}#endif		}				/** Positions the index on the inverted list of a given term.		 *		 * <p>This method can be called at any time. Note that it is <em>always</em> possible		 * to call this method with argument 0, even if offsets have not been loaded.		 *		 * @param term a term.		 */		protected void position( final int term ) throws IOException {			if ( term == 0 ) {				ibs.position(  0 );				ibs.readBits( 0 );				}			else {				if ( index.offsets == null ) throw new IllegalStateException( "You cannot position an index without offsets" );				final long offset = index.offsets.getLong( term );				ibs.position( offset );				// TODO: Can't we set this to 0?				ibs.readBits( offset );			}			currentTerm = term;			readFrequency();		}		public int termNumber() {			return currentTerm;		}				protected IndexIterator advance() throws IOException {			if ( currentTerm == index.numberOfTerms - 1 ) return null;			if ( currentTerm != -1 ) {				skipTo( Integer.MAX_VALUE );				nextDocument(); // This guarantees we have no garbage before the frequency			}			currentTerm++;			readFrequency();			return this;		}				private void readFrequency() throws IOException {						// Read the frequency#if GENERIC			switch( index.frequencyCoding ) {			case GAMMA:#endif#if GENERIC || #frequencies(GAMMA)				frequency = ibs.readGamma() + 1; #endif#if GENERIC				break;			case SHIFTED_GAMMA:#endif#if GENERIC || #frequencies(SHIFTED_GAMMA)				frequency = ibs.readShiftedGamma() + 1;#endif#if GENERIC				break;			case DELTA:#endif#if GENERIC || #frequencies(DELTA)				frequency = ibs.readDelta() + 1;#endif#if GENERIC				break;			default:				throw new IllegalStateException( "The required frequency coding (" + index.frequencyCoding + ") is not supported." );			}#endif							hasPointers = frequency < index.numberOfDocuments;		#if GENERIC				// We compute the modulus used for pointer Golomb coding 			if ( pointerCoding == Coding.GOLOMB ) {#endif#if GENERIC || #pointers(GOLOMB)				if ( hasPointers ) {					b = BitStreamIndex.golombModulus( frequency, index.numberOfDocuments );					log2b = Fast.mostSignificantBit( b );				}#endif#if GENERIC				}#endif#if GENERIC			if ( hasSkips ) {		#endif#if GENERIC || SKIPS				quantumBitLength = entryBitLength = -1;				lowest = Integer.MAX_VALUE;							if ( ASSERTS ) for( int i = height; i > Math.min( height, Fast.mostSignificantBit( frequency >> quantumDivisionShift ) ); i-- ) towerTopB[ i ] = towerLowerB[ i ] = pointerPrediction[ i ] = -1;				final long pointerQuantumSigma = BitStreamIndex.quantumSigma( frequency, index.numberOfDocuments, quantum );				for( int i = Math.min( height, Fast.mostSignificantBit( frequency >> quantumDivisionShift ) ); i >= 0; i-- ) {					towerTopB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i + 1 );					towerTopLog2B[ i ] = Fast.mostSignificantBit( towerTopB[ i ] );					towerLowerB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i );					towerLowerLog2B[ i ] = Fast.mostSignificantBit( towerLowerB[ i ] );					pointerPrediction[ i ] = (int)( ( quantum * ( 1L << i ) * index.numberOfDocuments + frequency / 2 ) / frequency );				}#endif#if GENERIC			}#endif			#if GENERIC || ! #counts(NONE)			count = -1;#endif			currentDocument = -1;			numberOfDocumentRecord = -1;			state = BEFORE_POINTER;		}					public Index index() {			return keyIndex;		}				public int frequency() { 			return frequency;		}		private void ensureCurrentDocument() {			if ( currentDocument < 0 ) throw new IllegalStateException( "nextDocument() has never been called for (term=" + currentTerm + ")" );			if ( currentDocument == Integer.MAX_VALUE ) throw new IllegalStateException( "This reader is positioned beyond the end of list of (term=" + currentTerm + ")" );		}				/** Returns whether there are no more document records in the current inverted list.		 * 		 * <p>This method returns true if the last document pointer of the current inverted		 * list has been read. It makes no distinction as to where (inside the last document		 * record) this reader is currently positioned. In particular, this method will		 * return true independently of whether count and positions have been read or not (we		 * note by passing that this is the only sensible behaviour, as you can build indices		 * with or without counts/positions).		 * 		 * <p>This method will return true also when this reader is positioned <em>beyond</em>		 * the last document pointer. In this case, {@link #currentDocumentPointer()} will		 * return {@link Integer#MAX_VALUE}.		 *		 * @return true whether there are no more document records in the current inverted list.		 */		private boolean endOfList() {			if ( ASSERTS ) assert numberOfDocumentRecord <= frequency;			return numberOfDocumentRecord >= frequency - 1;		}		public int document() {
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -