📄 bitstreamhpindexwriter.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
package it.unimi.dsi.mg4j.index;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;import it.unimi.dsi.io.NullOutputStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.search.score.VignaScorer;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.util.Map;/** Writes a bitstream-based high-performance index. The comments about * offsets in the documentation of {@link BitStreamIndexWriter} apply here, too. * * <p>The difference between indices generated by this class and those generated * by {@link BitStreamIndexWriter} lie in the level * of interleaving. Indices generated by this class have positions in a separate stream (similarly to Lucene), and * a compulsory skip structure (an extension of that used by a {@link BitStreamIndexWriter}) * that indexes both the main index file and the positions file. This can result in major performance * improvement in the resolution of position-based operators (e.g., phrases) and in the evaluation * of {@linkplain VignaScorer proximity-based scorers}. Since the overhead due to the additional * skip structure and to the separate positions stream is negligible, indices generated by * this class are the default in MG4J. *  * <p>Presently, indices generated by this class cannot carry payloads: you must use a {@link BitStreamIndexWriter} * in that case. Moreover, only nonparametric indices can be used for positions  * (this limitation rules out {@link Coding#GOLOMB}, {@link Coding#SKEWED_GOLOMB}, and {@link Coding#INTERPOLATIVE}). *  * @author Sebastiano Vigna  * @since 1.2 */public class BitStreamHPIndexWriter extends AbstractBitStreamIndexWriter implements IndexWriter {	private static final boolean ASSERTS = false;	private static final boolean DEBUG = false;	private static final boolean COOKIES = false;		/** The size of the buffer for the temporary file used to build an inverted list. Inverted lists	 * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */ 	public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;	/** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/	protected static final int BEFORE_INVERTED_LIST = 0;	/** This value of {@link #state} means that we are positioned at the start of an inverted list,	 * and we should call {@link #writeFrequency(int)}.*/	protected static final int BEFORE_FREQUENCY = 1;	/** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */	protected static final int BEFORE_DOCUMENT_RECORD = 2;	/** This value of {@link #state} means that we just started a new document record, and we	 * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */	protected static final int BEFORE_POINTER = 3;	/** This value of {@link #state} can be assumed only in indices that contain payloads; it	 * means that we are positioned just before the payload for the current document record. */	protected static final int BEFORE_PAYLOAD = 4;	/** This value of {@link #state} can be assumed only in indices that contain counts; it	 * means that we are positioned just before the count for the current document record. */	protected static final int BEFORE_COUNT = 5;	/** This value of {@link #state} can be assumed only in indices that contain document positions; 	 * it means that we are positioned just before the position list of the current document record. */	protected static final int BEFORE_POSITIONS = 6;	/** This is the first unused state. Subclasses may start from this value to define new states. */	protected static final int FIRST_UNUSED_STATE = 7;	/** The underlying index {@link OutputBitStream}. */	protected OutputBitStream obs;	/** The underlying positions {@link OutputBitStream}. */	protected OutputBitStream positions;	/** The offset {@link OutputBitStream}. */	private OutputBitStream offset;	/** The current state of the writer. */	protected int state;	/** The number of document records that the current inverted list will contain. */	protected int frequency;	/** The number of document records already written for the current inverted list. */	protected int writtenDocuments;	/** The current document pointer. */	protected int currentDocument;	/** The last document pointer in the current list. */	protected int lastDocument;	/** The position (in bytes) where the last inverted list started. */	private long lastInvertedListPos;	/** The parameter <code>b</code> for Golomb coding of pointers. */	protected int b;	/** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */	protected int log2b;	/** The maximum number of positions in a document record so far. */	public int maxCount;	/** The number of bits written for offsets in the file of positions. */	public long bitsForPositionsOffsets;	/** Maximum number of trials when optimising the entry bit length. */	private final static int MAX_TRY = 32;	/** The parameter <code>h</code> (the maximum height of a skip tower). */	private final int h;	/** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */	private final int q;	/** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */	private final int w;	/** The number of document records written in the cache containing the current block. */	private int cache;	/** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th	 *  skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>	 *  contains the first document pointer within the next block. */	private final int[] skipPointer;	/** The {@link OutputBitStream}s where cached document pointers are written. */	private final OutputBitStream[] cachePointer;	/** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */	private final FastByteArrayOutputStream[] cachePointerByte;	/** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip	 *  indices. */	private final OutputBitStream[] cacheSkip;	/** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */	private final OutputBitStream[] cacheSkipBitCount;	/** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip	*  indices. */	private final FastByteArrayOutputStream[] cacheSkipByte;	/** The {@link OutputBitStream} where cached document data are written. */	private final CachingOutputBitStream cacheDataOut;	/** The {@link FastBufferedInputStream} from which cached document data are read. */	private final FastBufferedInputStream cacheDataIn;	/** The length of the data segment for each quantum. */	private final int[] cacheDataLength;	/** The length of the positions bitstream for each quantum. */	private final long[] cachePositionsLength;	/** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */	private final OutputBitStream bitCount;	/** The sum of all tower data computed so far. */	public final TowerData towerData;	/** The number of bits written to the positions stream at the start of the current quantum. */	private long writtenPositionsBitsAtLastQuantum;		/** The number of bits written for quantum lengths. */	public long bitsForQuantumBitLengths;	/** The number of bits written for quantum lengths in the positions stream. */	public long bitsForPositionsQuantumBitLengths;	/** The number of bits written for entry lenghts. */	public long bitsForEntryBitLengths;	/** The number of written blocks. */	public long numberOfBlocks;	/** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been	 * written for the current inverted list. */	public int prevEntryBitLength;	/** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been	 * written for the current inverted list. */	public int prevQuantumBitLength;	/** An estimate on the number of bits occupied per quantum in the positions stream in the last written cache, or -1 if no cache has been	 * written for the current inverted list. */	public int prevPositionsQuantumBitLength;	/** The Golomb modulus for a top pointer skip, for each level. */	private final int[] towerTopB;		/** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */	private final int[] towerTopLog2B;		/** The Golomb modulus for a lower pointer skip, for each level. */	private final int[] towerLowerB;		/** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */	private final int[] towerLowerLog2B;		/** The prediction for a pointer skip, for each level. */	private final int[] pointerPrediction;	/** The <var>k</var>-th entry of this array contains the number of bits from the start of	 * the <var>k</var>-th skip tower up to the end of the current block (more precisely,	 * to the point that should be reached via skipping, which is just after the document pointer).	 * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)}, 	 * but it is declared here for efficiency.	 */	final private long[] distance;	/** The temporary file dumping the index data contained in a block. */	final private File tempFile;		/** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).	 *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 	 * 	 * @param basename the basename.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param writeOffsets if <code>true</code>, the offset file will also be produced.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 */	public BitStreamHPIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets,  int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {		this( 			new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ),			new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.POSITIONS_EXTENSION ) ),			writeOffsets? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION) ) : null,			numberOfDocuments,			tempBufferSize,			flags, q , h		 );	}	/** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.	 *	 * @param obs the underlying output bit stream.	 * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 * @throws IOException 	 */	public BitStreamHPIndexWriter( final OutputBitStream obs, final OutputBitStream positions, final OutputBitStream offset, final int numberOfDocuments, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {		super( numberOfDocuments, flags );		this.obs = obs;		this.positions = positions;		this.offset = offset;		this.frequency = -1;		this.currentTerm = -1;		this.maxCount = 0;		if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" );		if ( h < 0 ) throw new IllegalArgumentException( "Illegal height " + h );		if ( q <= 0 || ( q & -q ) != q ) throw new IllegalArgumentException( "Illegal quantum " + q );		this.h = h;		this.q = q;		int two2h = 1 << h;		w = two2h * q;		if ( DEBUG ) {			System.err.println( "Cache will contain at most " + w + " records (q=" + q + ",h=" + h + ")" );			System.err.print( "Skip records will be " );			for ( int i = 0; i < two2h; i++ ) System.err.print( ( i * q ) + " " );			System.err.println();		}		towerData = new TowerData();		tempFile = File.createTempFile( "MG4J", ".data" );		cacheDataIn = new FastBufferedInputStream( new FileInputStream( tempFile ) );		cacheDataOut = new CachingOutputBitStream( tempFile, tempBufferSize );		cacheDataLength = new int[two2h];		cachePositionsLength = new long[two2h + 1];		cachePointer = new OutputBitStream[two2h];		cachePointerByte = new FastByteArrayOutputStream[two2h];				for ( int i = 0; i < two2h; i++ ) 			cachePointer[i] = new OutputBitStream( cachePointerByte[i] = new FastByteArrayOutputStream(), 0 );		cacheSkip = new OutputBitStream[two2h];		cacheSkipBitCount = new OutputBitStream[two2h];		cacheSkipByte = new FastByteArrayOutputStream[two2h];		for ( int i = 0; i < two2h; i++ ) {			cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[i] = new FastByteArrayOutputStream(), 0 );			cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 );		}			skipPointer = new int[ two2h + 1 ];		distance = new long[ two2h + 1 ];		bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 );		towerTopB = new int[ h + 1 ];		towerTopLog2B = new int[ h + 1 ];		towerLowerB = new int[ h + 1 ];		towerLowerLog2B = new int[ h + 1 ];		pointerPrediction = new int[ h + 1 ];	}
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -