📄 skipbitstreamindexwriter.java
字号:
package it.unimi.dsi.mg4j.index;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;import it.unimi.dsi.io.NullOutputStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.io.PrintWriter;import java.util.Map;/** Provides facilities to write skip inverted indices, * that is, inverted indices with an additional skip structure. A skip inverted index * allows one to skip ahead when reading inverted lists. More specifically, * when reading the inverted list relative to a certain term, one may want to * decide to skip all document records that concern documents with pointer * less than a given integer. In a normal inverted index this is impossible: * one would have to read all document records sequentially. * * <p>The skipping structure used by this class is new: details can be found * <a href="http://vigna.dsi.unimi.it/papers.php#BoVCPESLQIIL">here</a>. * * @author Paolo Boldi * @author Sebastiano Vigna * @since 0.6 */public class SkipBitStreamIndexWriter extends BitStreamIndexWriter { /** The size of the buffer for the temporary file used to build an inverted list. Inverted lists * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */ public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024; private static final boolean ASSERTS = false; private static final boolean DEBUG = false; private final static boolean STATS = false; /** Maximum number of trials when optimising the entry bit length. */ private final static int MAX_TRY = 32; /** The parameter <code>h</code> (the maximum height of a skip tower). */ private final int h; /** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */ private final int q; /** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */ private final int w; /** The number of document records written in the cache containing the current block. */ private int cache; /** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th * skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code> * contains the first document pointer within the next block. */ private final int[] skipPointer; /** The {@link OutputBitStream}s where cached document pointers are written. */ private final OutputBitStream[] cachePointer; /** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */ private final FastByteArrayOutputStream[] cachePointerByte; /** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip * indices. */ private final OutputBitStream[] cacheSkip; /** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */ private final OutputBitStream[] cacheSkipBitCount; /** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip * indices. */ private final FastByteArrayOutputStream[] cacheSkipByte; /** The {@link OutputBitStream} where cached document data are written. */ private final CachingOutputBitStream cacheDataOut; /** The {@link FastBufferedInputStream} from which cached document data are read. */ private final FastBufferedInputStream cacheDataIn; /** The length of the data segment for each quantum. */ private final int[] cacheDataLength; /** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */ private final OutputBitStream bitCount; /** The sum of all tower data computed so far. */ public final TowerData towerData; /** The number of bits written for quantum lengths. */ public long bitsForQuantumBitLengths; /** The number of bits written for entry lenghts. */ public long bitsForEntryBitLengths; /** The number of written blocks. */ public long numberOfBlocks; /** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been * written for the current inverted list. */ public int prevEntryBitLength; /** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been * written for the current inverted list. */ public int prevQuantumBitLength; /** The Golomb modulus for a top pointer skip, for each level. */ private final int[] towerTopB; /** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */ private final int[] towerTopLog2B; /** The Golomb modulus for a lower pointer skip, for each level. */ private final int[] towerLowerB; /** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */ private final int[] towerLowerLog2B; /** The prediction for a pointer skip, for each level. */ private final int[] pointerPrediction; private java.io.PrintWriter pointerSkipStats; private java.io.PrintWriter pointerTopSkipStats; private java.io.PrintWriter bitSkipStats; private java.io.PrintWriter bitTopSkipStats; private String pointerSkipLine, bitSkipLine; /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>). * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). * * <p>The size of the internal temporary buffer will be {@link #DEFAULT_TEMP_BUFFER_SIZE}. * * @param basename the basename. * @param numberOfDocuments the number of documents in the collection to be indexed. * @param writeOffsets if <code>true</code>, the offset file will also be produced. * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}). * @param q the cache contains at most <var>2<sup>h</sup></var> document records. * @param h the maximum height of a skip tower. */ public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags, final int q, final int h ) throws IOException { this( new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ), writeOffsets ? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) ) : null, numberOfDocuments, DEFAULT_TEMP_BUFFER_SIZE, flags, q, h ); } /** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>). * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). * * @param basename the basename. * @param numberOfDocuments the number of documents in the collection to be indexed. * @param writeOffsets if <code>true</code>, the offset file will also be produced. * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}). * @param q the cache contains at most <var>2<sup>h</sup></var> document records. * @param h the maximum height of a skip tower. */ public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException { this( new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ), writeOffsets ? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) ) : null, numberOfDocuments, tempBufferSize, flags, q, h ); } /** * Creates a new skip index writer. * * @param obs the underlying output bit stream. * @param offset the offset bit stream. * @param N the number of documents in the collection to be indexed. * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}). * @param q the cache contains at most <var>2<sup>h</sup></var> document records. * @param h the maximum height of a skip tower. * @throws IOException */ public SkipBitStreamIndexWriter( final OutputBitStream obs, final OutputBitStream offset, final int N, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException { super( obs, offset, N, flags ); if ( h < 0 ) throw new IllegalArgumentException( "Illegal height " + h ); if ( q <= 0 || ( q & -q ) != q ) throw new IllegalArgumentException( "Illegal quantum " + q ); this.h = h; this.q = q; int two2h = 1 << h; w = two2h * q; if ( DEBUG ) { System.err.println( "Cache will contain at most " + w + " records (q=" + q + ",h=" + h + ")" ); System.err.print( "Skip records will be " ); for ( int i = 0; i < two2h; i++ ) System.err.print( ( i * q ) + " " ); System.err.println(); } towerData = new TowerData(); tempFile = File.createTempFile( "MG4J", ".data" ); cacheDataIn = new FastBufferedInputStream( new FileInputStream( tempFile ) ); cacheDataOut = new CachingOutputBitStream( tempFile, tempBufferSize ); cacheDataLength = new int[two2h]; cachePointer = new OutputBitStream[two2h]; cachePointerByte = new FastByteArrayOutputStream[two2h]; for ( int i = 0; i < two2h; i++ ) cachePointer[i] = new OutputBitStream( cachePointerByte[i] = new FastByteArrayOutputStream(), 0 ); cacheSkip = new OutputBitStream[two2h]; cacheSkipBitCount = new OutputBitStream[two2h]; cacheSkipByte = new FastByteArrayOutputStream[two2h]; for ( int i = 0; i < two2h; i++ ) { cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[i] = new FastByteArrayOutputStream(), 0 ); cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 ); } skipPointer = new int[ two2h + 1 ]; distance = new long[ two2h + 1 ]; bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 ); towerTopB = new int[ h + 1 ]; towerTopLog2B = new int[ h + 1 ]; towerLowerB = new int[ h + 1 ]; towerLowerLog2B = new int[ h + 1 ]; pointerPrediction = new int[ h + 1 ]; if ( STATS ) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -