📄 bitstreamhpindexwriter.java
字号:
package it.unimi.dsi.mg4j.index;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;import it.unimi.dsi.io.NullOutputStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.search.score.VignaScorer;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.util.Map;/** Writes a bitstream-based high-performance index. The comments about * offsets in the documentation of {@link BitStreamIndexWriter} apply here, too. * * <p>The difference between indices generated by this class and those generated * by {@link BitStreamIndexWriter} lie in the level * of interleaving. Indices generated by this class have positions in a separate stream (similarly to Lucene), and * a compulsory skip structure (an extension of that used by a {@link BitStreamIndexWriter}) * that indexes both the main index file and the positions file. This can result in major performance * improvement in the resolution of position-based operators (e.g., phrases) and in the evaluation * of {@linkplain VignaScorer proximity-based scorers}. Since the overhead due to the additional * skip structure and to the separate positions stream is negligible, indices generated by * this class are the default in MG4J. * * <p>Presently, indices generated by this class cannot carry payloads: you must use a {@link BitStreamIndexWriter} * in that case. Moreover, only nonparametric indices can be used for positions * (this limitation rules out {@link Coding#GOLOMB}, {@link Coding#SKEWED_GOLOMB}, and {@link Coding#INTERPOLATIVE}). * * @author Sebastiano Vigna * @since 1.2 */public class BitStreamHPIndexWriter extends AbstractBitStreamIndexWriter implements IndexWriter { private static final boolean ASSERTS = false; private static final boolean DEBUG = false; private static final boolean COOKIES = false; /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */ public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024; /** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/ protected static final int BEFORE_INVERTED_LIST = 0; /** This value of {@link #state} means that we are positioned at the start of an inverted list, * and we should call {@link #writeFrequency(int)}.*/ protected static final int BEFORE_FREQUENCY = 1; /** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */ protected static final int BEFORE_DOCUMENT_RECORD = 2; /** This value of {@link #state} means that we just started a new document record, and we * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */ protected static final int BEFORE_POINTER = 3; /** This value of {@link #state} can be assumed only in indices that contain payloads; it * means that we are positioned just before the payload for the current document record. */ protected static final int BEFORE_PAYLOAD = 4; /** This value of {@link #state} can be assumed only in indices that contain counts; it * means that we are positioned just before the count for the current document record. */ protected static final int BEFORE_COUNT = 5; /** This value of {@link #state} can be assumed only in indices that contain document positions; * it means that we are positioned just before the position list of the current document record. */ protected static final int BEFORE_POSITIONS = 6; /** This is the first unused state. Subclasses may start from this value to define new states. */ protected static final int FIRST_UNUSED_STATE = 7; /** The underlying index {@link OutputBitStream}. */ protected OutputBitStream obs; /** The underlying positions {@link OutputBitStream}. */ protected OutputBitStream positions; /** The offset {@link OutputBitStream}. */ private OutputBitStream offset; /** The current state of the writer. */ protected int state; /** The number of document records that the current inverted list will contain. */ protected int frequency; /** The number of document records already written for the current inverted list. */ protected int writtenDocuments; /** The current document pointer. */ protected int currentDocument; /** The last document pointer in the current list. */ protected int lastDocument; /** The position (in bytes) where the last inverted list started. */ private long lastInvertedListPos; /** The parameter <code>b</code> for Golomb coding of pointers. */ protected int b; /** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */ protected int log2b; /** The maximum number of positions in a document record so far. */ public int maxCount; /** The number of bits written for offsets in the file of positions. */ public long bitsForPositionsOffsets; /** Maximum number of trials when optimising the entry bit length. */ private final static int MAX_TRY = 32; /** The parameter <code>h</code> (the maximum height of a skip tower). */ private final int h; /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */ private final int q; /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */ private final int w; /** The number of document records written in the cache containing the current block. */ private int cache; /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th * skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code> * contains the first document pointer within the next block. */ private final int[] skipPointer; /** The {@link OutputBitStream}s where cached document pointers are written. */ private final OutputBitStream[] cachePointer; /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */ private final FastByteArrayOutputStream[] cachePointerByte; /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip * indices. */ private final OutputBitStream[] cacheSkip; /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */ private final OutputBitStream[] cacheSkipBitCount; /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip * indices. */ private final FastByteArrayOutputStream[] cacheSkipByte; /** The {@link OutputBitStream} where cached document data are written. */ private final CachingOutputBitStream cacheDataOut; /** The {@link FastBufferedInputStream} from which cached document data are read. */ private final FastBufferedInputStream cacheDataIn; /** The length of the data segment for each quantum. */ private final int[] cacheDataLength; /** The length of the positions bitstream for each quantum. */ private final long[] cachePositionsLength; /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */ private final OutputBitStream bitCount; /** The sum of all tower data computed so far. */ public final TowerData towerData; /** The number of bits written to the positions stream at the start of the current quantum. */ private long writtenPositionsBitsAtLastQuantum; /** The number of bits written for quantum lengths. */ public long bitsForQuantumBitLengths; /** The number of bits written for quantum lengths in the positions stream. */ public long bitsForPositionsQuantumBitLengths; /** The number of bits written for entry lenghts. */ public long bitsForEntryBitLengths; /** The number of written blocks. */ public long numberOfBlocks; /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been * written for the current inverted list. */ public int prevEntryBitLength; /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been * written for the current inverted list. */ public int prevQuantumBitLength; /** An estimate on the number of bits occupied per quantum in the positions stream in the last written cache, or -1 if no cache has been * written for the current inverted list. */ public int prevPositionsQuantumBitLength; /** The Golomb modulus for a top pointer skip, for each level. */ private final int[] towerTopB; /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */ private final int[] towerTopLog2B; /** The Golomb modulus for a lower pointer skip, for each level. */ private final int[] towerLowerB; /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */ private final int[] towerLowerLog2B; /** The prediction for a pointer skip, for each level. */ private final int[] pointerPrediction; /** The <var>k</var>-th entry of this array contains the number of bits from the start of * the <var>k</var>-th skip tower up to the end of the current block (more precisely, * to the point that should be reached via skipping, which is just after the document pointer). * Indices are skip indices. It is used just by {@link #tryTower(int, int, long, OutputBitStream[], TowerData, boolean)}, * but it is declared here for efficiency. */ final private long[] distance; /** The temporary file dumping the index data contained in a block. */ final private File tempFile; /** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>). * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). * * @param basename the basename. * @param numberOfDocuments the number of documents in the collection to be indexed. * @param writeOffsets if <code>true</code>, the offset file will also be produced. * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}). */ public BitStreamHPIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException { this( new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ), new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.POSITIONS_EXTENSION ) ), writeOffsets? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION) ) : null, numberOfDocuments, tempBufferSize, flags, q , h ); } /** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}. * * @param obs the underlying output bit stream. * @param offset the offset bit stream, or <code>null</code> if offsets should not be written. * @param numberOfDocuments the number of documents in the collection to be indexed. * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}). * @throws IOException */ public BitStreamHPIndexWriter( final OutputBitStream obs, final OutputBitStream positions, final OutputBitStream offset, final int numberOfDocuments, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException { super( numberOfDocuments, flags ); this.obs = obs; this.positions = positions; this.offset = offset; this.frequency = -1; this.currentTerm = -1; this.maxCount = 0; if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" ); if ( h < 0 ) throw new IllegalArgumentException( "Illegal height " + h ); if ( q <= 0 || ( q & -q ) != q ) throw new IllegalArgumentException( "Illegal quantum " + q ); this.h = h; this.q = q; int two2h = 1 << h; w = two2h * q; if ( DEBUG ) { System.err.println( "Cache will contain at most " + w + " records (q=" + q + ",h=" + h + ")" ); System.err.print( "Skip records will be " ); for ( int i = 0; i < two2h; i++ ) System.err.print( ( i * q ) + " " ); System.err.println(); } towerData = new TowerData(); tempFile = File.createTempFile( "MG4J", ".data" ); cacheDataIn = new FastBufferedInputStream( new FileInputStream( tempFile ) ); cacheDataOut = new CachingOutputBitStream( tempFile, tempBufferSize ); cacheDataLength = new int[two2h]; cachePositionsLength = new long[two2h + 1]; cachePointer = new OutputBitStream[two2h]; cachePointerByte = new FastByteArrayOutputStream[two2h]; for ( int i = 0; i < two2h; i++ ) cachePointer[i] = new OutputBitStream( cachePointerByte[i] = new FastByteArrayOutputStream(), 0 ); cacheSkip = new OutputBitStream[two2h]; cacheSkipBitCount = new OutputBitStream[two2h]; cacheSkipByte = new FastByteArrayOutputStream[two2h]; for ( int i = 0; i < two2h; i++ ) { cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[i] = new FastByteArrayOutputStream(), 0 ); cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 ); } skipPointer = new int[ two2h + 1 ]; distance = new long[ two2h + 1 ]; bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 ); towerTopB = new int[ h + 1 ]; towerTopLog2B = new int[ h + 1 ]; towerLowerB = new int[ h + 1 ]; towerLowerLog2B = new int[ h + 1 ]; pointerPrediction = new int[ h + 1 ]; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -