⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 skipbitstreamindexwriter.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
package it.unimi.dsi.mg4j.index;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.ints.Int2IntRBTreeMap;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;import it.unimi.dsi.io.NullOutputStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.util.Properties;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.io.PrintWriter;import java.util.Map;/** Provides facilities to write skip inverted indices, *  that is, inverted indices with an additional skip structure. A skip inverted index *  allows one to skip ahead when reading inverted lists. More specifically, *  when reading the inverted list relative to a certain term, one may want to *  decide to skip all document records that concern documents with pointer *  less than a given integer. In a normal inverted index this is impossible: *  one would have to read all document records sequentially. *   *  <p>The skipping structure used by this class is new: details can be found  *  <a href="http://vigna.dsi.unimi.it/papers.php#BoVCPESLQIIL">here</a>. *   * @author Paolo Boldi * @author Sebastiano Vigna * @since 0.6 */public class SkipBitStreamIndexWriter extends BitStreamIndexWriter {	/** The size of the buffer for the temporary file used to build an inverted list. Inverted lists	 * shorter than this number of bytes will be directly rebuilt from the buffer, and never flushed to disk. */ 	public final static int DEFAULT_TEMP_BUFFER_SIZE = 64 * 1024 * 1024;	private static final boolean ASSERTS = false;	private static final boolean DEBUG = false;	private final static boolean STATS = false;	/** Maximum number of trials when optimising the entry bit length. */	private final static int MAX_TRY = 32;	/** The parameter <code>h</code> (the maximum height of a skip tower). */	private final int h;	/** The parameter <code>q</code> (2<var><sup>h</sup>q</var> documents record are kept in the cache); necessarily a power of two. */	private final int q;	/** We have <var>w</var>=2<sup><var>h</var></sup><var>q</var>. */	private final int w;	/** The number of document records written in the cache containing the current block. */	private int cache;	/** The <var>k</var>-th entry of this array contains the document pointer of the <var>k</var>-th	 *  skip document record within the current block. For sake of simplicity, <code>pointer[cache]</code>	 *  contains the first document pointer within the next block. */	private final int[] skipPointer;	/** The {@link OutputBitStream}s where cached document pointers are written. */	private final OutputBitStream[] cachePointer;	/** The {@link FastByteArrayOutputStream}s underlying <code>cachePointer</code> . */	private final FastByteArrayOutputStream[] cachePointerByte;	/** The {@link OutputBitStream}s where cached skip towers are written. Indices are skip	 *  indices. */	private final OutputBitStream[] cacheSkip;	/** An array whose entries (as many as those of {@link #cacheSkip}) are all {@link #bitCount}. */	private final OutputBitStream[] cacheSkipBitCount;	/** The {@link FastByteArrayOutputStream}s underlying <code>cacheSkip</code> . Indices are skip	*  indices. */	private final FastByteArrayOutputStream[] cacheSkipByte;	/** The {@link OutputBitStream} where cached document data are written. */	private final CachingOutputBitStream cacheDataOut;	/** The {@link FastBufferedInputStream} from which cached document data are read. */	private final FastBufferedInputStream cacheDataIn;	/** The length of the data segment for each quantum. */	private final int[] cacheDataLength;	/** An {@link OutputBitStream} wrapping a {@link NullOutputStream} for code-length preview. */	private final OutputBitStream bitCount;	/** The sum of all tower data computed so far. */	public final TowerData towerData;	/** The number of bits written for quantum lengths. */	public long bitsForQuantumBitLengths;	/** The number of bits written for entry lenghts. */	public long bitsForEntryBitLengths;	/** The number of written blocks. */	public long numberOfBlocks;	/** An estimate on the number of bits occupied per tower entry in the last written cache, or -1 if no cache has been	 * written for the current inverted list. */	public int prevEntryBitLength;	/** An estimate on the number of bits occupied per quantum in the last written cache, or -1 if no cache has been	 * written for the current inverted list. */	public int prevQuantumBitLength;	/** The Golomb modulus for a top pointer skip, for each level. */	private final int[] towerTopB;		/** The most significant bit of the Golomb modulus for a top pointer skip, for each level. */	private final int[] towerTopLog2B;		/** The Golomb modulus for a lower pointer skip, for each level. */	private final int[] towerLowerB;		/** The most significant bit of the Golomb modulus for a lower pointer skip, for each level. */	private final int[] towerLowerLog2B;		/** The prediction for a pointer skip, for each level. */	private final int[] pointerPrediction;		private java.io.PrintWriter pointerSkipStats;	private java.io.PrintWriter pointerTopSkipStats;	private java.io.PrintWriter bitSkipStats;	private java.io.PrintWriter bitTopSkipStats;	private String pointerSkipLine, bitSkipLine;	/** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).	 *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 	 *  	 *  <p>The size of the internal temporary buffer will be {@link #DEFAULT_TEMP_BUFFER_SIZE}.	 * 	 * @param basename the basename.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param writeOffsets if <code>true</code>, the offset file will also be produced.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 * @param q the cache contains at most <var>2<sup>h</sup></var> document records.	 * @param h the maximum height of a skip tower.	 */	public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {		this( new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ), writeOffsets ? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) ) : null, numberOfDocuments, DEFAULT_TEMP_BUFFER_SIZE, flags, q, h );	}	/** Creates a new skip index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).	 *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 	 * 	 * @param basename the basename.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param writeOffsets if <code>true</code>, the offset file will also be produced.	 * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). 	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 * @param q the cache contains at most <var>2<sup>h</sup></var> document records.	 * @param h the maximum height of a skip tower.	 */	public SkipBitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, int tempBufferSize, final Map<Component,Coding> flags, final int q, final int h ) throws IOException {		this( new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ), writeOffsets ? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) ) : null, numberOfDocuments, tempBufferSize, flags, q, h );	}	/**	 * Creates a new skip index writer.	 * 	 * @param obs the underlying output bit stream.	 * @param offset the offset bit stream.	 * @param N the number of documents in the collection to be indexed.	 * @param tempBufferSize the size in bytes of the internal temporary buffer (inverted lists shorter than this size will never be flushed to disk). 	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 * @param q the cache contains at most <var>2<sup>h</sup></var> document records.	 * @param h the maximum height of a skip tower.	 * @throws IOException	 */	public SkipBitStreamIndexWriter( final OutputBitStream obs, final OutputBitStream offset, final int N, int tempBufferSize,		final Map<Component,Coding> flags, final int q, final int h ) throws IOException {		super( obs, offset, N, flags );		if ( h < 0 ) throw new IllegalArgumentException( "Illegal height " + h );		if ( q <= 0 || ( q & -q ) != q ) throw new IllegalArgumentException( "Illegal quantum " + q );		this.h = h;		this.q = q;		int two2h = 1 << h;		w = two2h * q;		if ( DEBUG ) {			System.err.println( "Cache will contain at most " + w + " records (q=" + q + ",h=" + h + ")" );			System.err.print( "Skip records will be " );			for ( int i = 0; i < two2h; i++ ) System.err.print( ( i * q ) + " " );			System.err.println();		}		towerData = new TowerData();		tempFile = File.createTempFile( "MG4J", ".data" );		cacheDataIn = new FastBufferedInputStream( new FileInputStream( tempFile ) );		cacheDataOut = new CachingOutputBitStream( tempFile, tempBufferSize );		cacheDataLength = new int[two2h];		cachePointer = new OutputBitStream[two2h];		cachePointerByte = new FastByteArrayOutputStream[two2h];				for ( int i = 0; i < two2h; i++ ) 			cachePointer[i] = new OutputBitStream( cachePointerByte[i] = new FastByteArrayOutputStream(), 0 );		cacheSkip = new OutputBitStream[two2h];		cacheSkipBitCount = new OutputBitStream[two2h];		cacheSkipByte = new FastByteArrayOutputStream[two2h];		for ( int i = 0; i < two2h; i++ ) {			cacheSkip[ i ] = new OutputBitStream( cacheSkipByte[i] = new FastByteArrayOutputStream(), 0 );			cacheSkipBitCount[ i ] = new OutputBitStream( NullOutputStream.getInstance(), 0 );		}			skipPointer = new int[ two2h + 1 ];		distance = new long[ two2h + 1 ];		bitCount = new OutputBitStream( NullOutputStream.getInstance(), 0 );		towerTopB = new int[ h + 1 ];		towerTopLog2B = new int[ h + 1 ];		towerLowerB = new int[ h + 1 ];		towerLowerLog2B = new int[ h + 1 ];		pointerPrediction = new int[ h + 1 ];		if ( STATS ) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -