⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bytearraypostinglist.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.io;/*		 * MG4J: Managing Gigabytes for Java** Copyright (C) 2007 Sebastiano Vigna **  This library is free software; you can redistribute it and/or modify it*  under the terms of the GNU Lesser General Public License as published by the Free*  Software Foundation; either version 2.1 of the License, or (at your option)*  any later version.**  This library is distributed in the hope that it will be useful, but*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License*  for more details.**  You should have received a copy of the GNU Lesser General Public License*  along with this program; if not, write to the Free Software*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import static it.unimi.dsi.io.OutputBitStream.DELTA;import static it.unimi.dsi.io.OutputBitStream.GAMMA;import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.bytes.ByteArrays;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.tool.Scan;import java.io.File;import java.io.Flushable;import java.io.IOException;/** Lightweight posting accumulator with format similar to that generated by {@link BitStreamIndexWriter}. *  * <p>This class is essentially a dirty trick: it borrows some code and precomputed tables from {@link OutputBitStream} * and exposes two simple methods ({@link #setDocumentPointer(int)} and {@link #addPosition(int)}) with obvious * semantics. The resulting posting list is compressed exactly like an {@link BitStreamIndexWriter} would do (also in this  * case, duplicating some logic found therein). As a result, after completing the calls and after a call to {@link #flush()} * the internal {@link #buffer} can be written directly to a bit stream to build an index (but see {@link #stripPointers(OutputBitStream, long)}). *  * <p>{@link Scan} uses an instance of this class for each indexed term. Instances can be <em>differential</em>, in which * case they assume {@link #setDocumentPointer(int)} will be called with increasing values and store gaps rather * than document pointers. * * @author Sebastiano Vigna * @since 1.2 */public class ByteArrayPostingList implements Flushable {	private final static boolean DEBUG = false;	/** The the enlargement of the backing array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This	 * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to double a posting list. */	private final static int EMERGENCY_INCREMENT = 4096;	/** The internal buffer. */	public byte[] buffer;	/** The current frequency (number of calls to {@link #setDocumentPointer(int)}). */	public int frequency;	/** The current global count. */	public long globCount;	/** The current count (number of valid entries in {@link #position}). */	private int count;	/** The maximum count ever seen. */	public int maxCount;	/** If true, this list experienced an {@link OutOfMemoryError} during some buffer reallocation. */	public boolean outOfMemoryError;	/** Current bit buffer. */	private int current;	/** Current number of free bits in the bit buffer (the bits in the buffer are stored high). */	private int free;	/** Current position in the byte buffer. */	private int pos;	/** Current number of bytes available in the byte buffer. */	private int avail;	/** A small, local cache for positions. */	private int[] position;	/** The last document pointer passed to {@link #setDocumentPointer(int)}. */	private int lastPointer;	/** Whether this stream is differential. */	private final boolean differential;			/** Creates a new posting list wrapping a given byte array.	 *	 * @param a the byte array to wrap.	 * @param differential whether this stream should be differential (e.g., whether it should store document pointers as gaps).	 */	public ByteArrayPostingList( final byte[] a, boolean differential ) {		this.differential = differential;		free = 8;		buffer = a;		avail = a.length;		position = new int[ 4 ];		lastPointer = -1;	}	private void write( final int b ) {		if ( avail == 0 ) {			final int oldLength = buffer.length;			try {				buffer = ByteArrays.grow( buffer, buffer.length + 1 );			}			catch( OutOfMemoryError e ) {				outOfMemoryError = true;				try {					// We try at all costs to avoid out-of-memory errors: we dump the buffer, try to allocate a slightly larger array and reload it.					File temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" );					temp.deleteOnExit();					BinIO.storeBytes( buffer, temp );					buffer = null;					buffer = new byte[ oldLength + EMERGENCY_INCREMENT ];					BinIO.loadBytes( temp, buffer );					temp.delete();				}				catch ( IOException f ) {					throw e;				}			}			avail += buffer.length - oldLength;		}		avail--;		buffer[ pos++ ] = (byte)b;	}	/** Flushes the internal bit buffer to the {@linkplain #buffer byte buffer}.	 * 	 * @return the number of bits written.	 */		public int align() {		if ( free != 8 ) return writeInCurrent( 0, free );		else return 0;	}	/*	 * The code below is copied from OutputBitStream.	 */		private int writeInCurrent( final int b, final int len ) {		if ( DEBUG ) if ( len > free ) throw new IllegalArgumentException( Integer.toString( len ) + " bit(s) to write, " + free + " available." );		current |= ( b & ( ( 1 << len ) - 1 ) ) << ( free -= len );		if ( free == 0 ) {			write( current );			free = 8;			current = 0;		}		return len;	}	private int writeInt( int x, final int len ) {		if ( len < 0 || len > 32 ) throw new IllegalArgumentException( "You cannot write " + len + " bits to an integer." );		if ( len <= free ) return writeInCurrent( x, len );		int i = len - free;		final int queue = i & 7;				if ( free != 0 ) writeInCurrent( x >>> i, free );		// Dirty trick: since queue < 8, we pre-write the last bits in the bit buffer.		if ( queue != 0 ) {			i -= queue;			writeInCurrent( x, queue );			x >>>= queue;		}		if ( i == 32 ) write( x >>> 24 );		if ( i > 23 ) write( x >>> 16 );		if ( i > 15 ) write( x >>> 8 );		if ( i > 7 ) write( x );				return len;	}	private int writeUnary( int x ) {		if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );		if ( x < free ) return writeInCurrent( 1, x + 1 );		final int shift = free;		x -= shift;		write( current );		free = 8;		current = 0;		int i = x >> 3;		while( i-- != 0 ) write( 0 );		writeInCurrent( 1, ( x & 7 ) + 1 );		return x + shift + 1;	}	private int writeGamma( int x ) {		if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );		if ( x < MAX_PRECOMPUTED ) return writeInt( GAMMA[ x ], GAMMA[ x ] >>> 26 );				final int msb = Fast.mostSignificantBit( ++x );		final int l = writeUnary( msb );		return l + ( msb != 0 ? writeInt( x, msb ) : 0 );	}	private int writeDelta( int x ) {		if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" );		if ( x < MAX_PRECOMPUTED ) return writeInt( DELTA[ x ], DELTA[ x ] >>> 26 );		final int msb = Fast.mostSignificantBit( ++x );		final int l = writeGamma( msb );		return l + ( msb != 0 ? writeInt( x, msb ) : 0 );	}	/** Flushes the positions cached internally.	 * 	 */	public void flush() {		if ( count != 0 ) {			writeGamma( count - 1 );			globCount += count;			if ( maxCount < count ) maxCount = count;			writeDelta( position[ 0 ] );			for( int i = 1; i < count; i++ ) writeDelta( position[ i ] - position[ i - 1 ] - 1 );			count = 0;		}	}	/** Sets the current document pointer.	 * 	 * <p>If the document pointer is changed since the last call, the positions currently	 * stored are {@linkplain #flush() flushed} and the new pointer is written to the stream.	 * 	 * @param pointer a document pointer.	 */	public void setDocumentPointer( final int pointer ) {		if ( pointer != lastPointer ) {			flush();			writeDelta( differential ? pointer - lastPointer - 1 : pointer );			lastPointer = pointer;			frequency++;		}	}	/** Adds a new position for the current document pointer.	 * 	 * <p>It is mandatory that successive calls to this method for	 * the same document pointer have increasing arguments.	 * 	 * @param pos a position.	 */		public void addPosition( final int pos ) {		if ( lastPointer == -1 ) throw new IllegalStateException();		if ( count == position.length ) position = IntArrays.grow( position, count + 1 );		position[ count++ ] = pos;	}			/** Returns the number of bits written by this posting list.	 * 	 * @return the number of bits written by this posting list.	 */	public long writtenBits() {		return pos * 8L + 8 - free;	}		/** Writes the given number of bits of the internal buffer to the provided output bit stream,	 * stripping all document pointers.	 * 	 * <p>This method is a horrible kluge solving the problem of terms appearing in all documents:	 * {@link BitStreamIndexWriter} would <em>not</em> write pointers in this case, but we do not know 	 * whether we will need pointers or not while we are filling the internal buffer. Thus, for	 * those (hopefully few) termas appearing in all documents this method can be used to	 * dump the internal buffer stripping all pointers.	 * 	 * <p>Note that the valid number of bits should be retrieved using {@link #writtenBits()}	 * after a {@link #flush()}. Then, a call to {@link #align()} will dump to the buffer	 * the bits still floating in the bit buffer; at that point this method can be called safely.	 * 	 * @param obs an output bit stream.	 * @param bitLength the number of bits to be scanned.	 * @throws IOException 	 */	public void stripPointers( final OutputBitStream obs, final long bitLength ) throws IOException {		final InputBitStream ibs = new InputBitStream( buffer );		int count;		while( ibs.readBits() < bitLength ) {			ibs.readDelta(); // Discard pointer			count = ibs.readGamma() + 1;			obs.writeGamma( count - 1 );			while( count-- != 0 ) obs.writeDelta( ibs.readDelta() );		}	}	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -