📄 bytearraypostinglist.java
字号:
package it.unimi.dsi.mg4j.io;/* * MG4J: Managing Gigabytes for Java** Copyright (C) 2007 Sebastiano Vigna ** This library is free software; you can redistribute it and/or modify it* under the terms of the GNU Lesser General Public License as published by the Free* Software Foundation; either version 2.1 of the License, or (at your option)* any later version.** This library is distributed in the hope that it will be useful, but* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License* for more details.** You should have received a copy of the GNU Lesser General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import static it.unimi.dsi.io.OutputBitStream.DELTA;import static it.unimi.dsi.io.OutputBitStream.GAMMA;import static it.unimi.dsi.io.OutputBitStream.MAX_PRECOMPUTED;import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.bytes.ByteArrays;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.tool.Scan;import java.io.File;import java.io.Flushable;import java.io.IOException;/** Lightweight posting accumulator with format similar to that generated by {@link BitStreamIndexWriter}. * * <p>This class is essentially a dirty trick: it borrows some code and precomputed tables from {@link OutputBitStream} * and exposes two simple methods ({@link #setDocumentPointer(int)} and {@link #addPosition(int)}) with obvious * semantics. The resulting posting list is compressed exactly like an {@link BitStreamIndexWriter} would do (also in this * case, duplicating some logic found therein). As a result, after completing the calls and after a call to {@link #flush()} * the internal {@link #buffer} can be written directly to a bit stream to build an index (but see {@link #stripPointers(OutputBitStream, long)}). * * <p>{@link Scan} uses an instance of this class for each indexed term. Instances can be <em>differential</em>, in which * case they assume {@link #setDocumentPointer(int)} will be called with increasing values and store gaps rather * than document pointers. * * @author Sebastiano Vigna * @since 1.2 */public class ByteArrayPostingList implements Flushable { private final static boolean DEBUG = false; /** The the enlargement of the backing array causes an out-of-memory error, we set {@link #outOfMemoryError} and try again with a very small increment. This * should help in the unlikely (but entirely possible) circumstance that there is not enough memory to double a posting list. */ private final static int EMERGENCY_INCREMENT = 4096; /** The internal buffer. */ public byte[] buffer; /** The current frequency (number of calls to {@link #setDocumentPointer(int)}). */ public int frequency; /** The current global count. */ public long globCount; /** The current count (number of valid entries in {@link #position}). */ private int count; /** The maximum count ever seen. */ public int maxCount; /** If true, this list experienced an {@link OutOfMemoryError} during some buffer reallocation. */ public boolean outOfMemoryError; /** Current bit buffer. */ private int current; /** Current number of free bits in the bit buffer (the bits in the buffer are stored high). */ private int free; /** Current position in the byte buffer. */ private int pos; /** Current number of bytes available in the byte buffer. */ private int avail; /** A small, local cache for positions. */ private int[] position; /** The last document pointer passed to {@link #setDocumentPointer(int)}. */ private int lastPointer; /** Whether this stream is differential. */ private final boolean differential; /** Creates a new posting list wrapping a given byte array. * * @param a the byte array to wrap. * @param differential whether this stream should be differential (e.g., whether it should store document pointers as gaps). */ public ByteArrayPostingList( final byte[] a, boolean differential ) { this.differential = differential; free = 8; buffer = a; avail = a.length; position = new int[ 4 ]; lastPointer = -1; } private void write( final int b ) { if ( avail == 0 ) { final int oldLength = buffer.length; try { buffer = ByteArrays.grow( buffer, buffer.length + 1 ); } catch( OutOfMemoryError e ) { outOfMemoryError = true; try { // We try at all costs to avoid out-of-memory errors: we dump the buffer, try to allocate a slightly larger array and reload it. File temp = File.createTempFile( ByteArrayPostingList.class.getSimpleName(), "dump" ); temp.deleteOnExit(); BinIO.storeBytes( buffer, temp ); buffer = null; buffer = new byte[ oldLength + EMERGENCY_INCREMENT ]; BinIO.loadBytes( temp, buffer ); temp.delete(); } catch ( IOException f ) { throw e; } } avail += buffer.length - oldLength; } avail--; buffer[ pos++ ] = (byte)b; } /** Flushes the internal bit buffer to the {@linkplain #buffer byte buffer}. * * @return the number of bits written. */ public int align() { if ( free != 8 ) return writeInCurrent( 0, free ); else return 0; } /* * The code below is copied from OutputBitStream. */ private int writeInCurrent( final int b, final int len ) { if ( DEBUG ) if ( len > free ) throw new IllegalArgumentException( Integer.toString( len ) + " bit(s) to write, " + free + " available." ); current |= ( b & ( ( 1 << len ) - 1 ) ) << ( free -= len ); if ( free == 0 ) { write( current ); free = 8; current = 0; } return len; } private int writeInt( int x, final int len ) { if ( len < 0 || len > 32 ) throw new IllegalArgumentException( "You cannot write " + len + " bits to an integer." ); if ( len <= free ) return writeInCurrent( x, len ); int i = len - free; final int queue = i & 7; if ( free != 0 ) writeInCurrent( x >>> i, free ); // Dirty trick: since queue < 8, we pre-write the last bits in the bit buffer. if ( queue != 0 ) { i -= queue; writeInCurrent( x, queue ); x >>>= queue; } if ( i == 32 ) write( x >>> 24 ); if ( i > 23 ) write( x >>> 16 ); if ( i > 15 ) write( x >>> 8 ); if ( i > 7 ) write( x ); return len; } private int writeUnary( int x ) { if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" ); if ( x < free ) return writeInCurrent( 1, x + 1 ); final int shift = free; x -= shift; write( current ); free = 8; current = 0; int i = x >> 3; while( i-- != 0 ) write( 0 ); writeInCurrent( 1, ( x & 7 ) + 1 ); return x + shift + 1; } private int writeGamma( int x ) { if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" ); if ( x < MAX_PRECOMPUTED ) return writeInt( GAMMA[ x ], GAMMA[ x ] >>> 26 ); final int msb = Fast.mostSignificantBit( ++x ); final int l = writeUnary( msb ); return l + ( msb != 0 ? writeInt( x, msb ) : 0 ); } private int writeDelta( int x ) { if ( x < 0 ) throw new IllegalArgumentException( "The argument " + x + " is negative" ); if ( x < MAX_PRECOMPUTED ) return writeInt( DELTA[ x ], DELTA[ x ] >>> 26 ); final int msb = Fast.mostSignificantBit( ++x ); final int l = writeGamma( msb ); return l + ( msb != 0 ? writeInt( x, msb ) : 0 ); } /** Flushes the positions cached internally. * */ public void flush() { if ( count != 0 ) { writeGamma( count - 1 ); globCount += count; if ( maxCount < count ) maxCount = count; writeDelta( position[ 0 ] ); for( int i = 1; i < count; i++ ) writeDelta( position[ i ] - position[ i - 1 ] - 1 ); count = 0; } } /** Sets the current document pointer. * * <p>If the document pointer is changed since the last call, the positions currently * stored are {@linkplain #flush() flushed} and the new pointer is written to the stream. * * @param pointer a document pointer. */ public void setDocumentPointer( final int pointer ) { if ( pointer != lastPointer ) { flush(); writeDelta( differential ? pointer - lastPointer - 1 : pointer ); lastPointer = pointer; frequency++; } } /** Adds a new position for the current document pointer. * * <p>It is mandatory that successive calls to this method for * the same document pointer have increasing arguments. * * @param pos a position. */ public void addPosition( final int pos ) { if ( lastPointer == -1 ) throw new IllegalStateException(); if ( count == position.length ) position = IntArrays.grow( position, count + 1 ); position[ count++ ] = pos; } /** Returns the number of bits written by this posting list. * * @return the number of bits written by this posting list. */ public long writtenBits() { return pos * 8L + 8 - free; } /** Writes the given number of bits of the internal buffer to the provided output bit stream, * stripping all document pointers. * * <p>This method is a horrible kluge solving the problem of terms appearing in all documents: * {@link BitStreamIndexWriter} would <em>not</em> write pointers in this case, but we do not know * whether we will need pointers or not while we are filling the internal buffer. Thus, for * those (hopefully few) termas appearing in all documents this method can be used to * dump the internal buffer stripping all pointers. * * <p>Note that the valid number of bits should be retrieved using {@link #writtenBits()} * after a {@link #flush()}. Then, a call to {@link #align()} will dump to the buffer * the bits still floating in the bit buffer; at that point this method can be called safely. * * @param obs an output bit stream. * @param bitLength the number of bits to be scanned. * @throws IOException */ public void stripPointers( final OutputBitStream obs, final long bitLength ) throws IOException { final InputBitStream ibs = new InputBitStream( buffer ); int count; while( ibs.readBits() < bitLength ) { ibs.readDelta(); // Discard pointer count = ibs.readGamma() + 1; obs.writeGamma( count - 1 ); while( count-- != 0 ) obs.writeDelta( ibs.readDelta() ); } } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -