⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bitstreamindexwriter.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package it.unimi.dsi.mg4j.index;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2003-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.Fast;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.io.InterpolativeCoding;import it.unimi.dsi.util.Properties;import jal.ints.Sorting;import java.io.FileOutputStream;import java.io.IOException;import java.util.Map;/** Writes a bitstream-based interleaved index. * * <H2>Offsets bit stream</H2> * * <P>An inverted index may have an associated {@link OutputBitStream} of * offsets: this file contains <code>T+1</code> integers, where <code>T</code> * is the number of inverted lists (i.e., the number of terms), and the * <code>i</code>-th entry is a suitable coding of the position in bits where * the <code>i</code>-th inverted list starts (the last entry is actually the * length, in bytes, of the inverted index file itself). The coding used for * the offset stream is a &gamma; code of the difference between the current * position and the last position. * * @author Paolo Boldi  * @author Sebastiano Vigna  * @since 0.6 */public class BitStreamIndexWriter extends AbstractBitStreamIndexWriter {	private static final boolean ASSERTS = false;		/** This value of {@link #state} means that we should call {@link #newInvertedList()}.*/	protected static final int BEFORE_INVERTED_LIST = 0;	/** This value of {@link #state} means that we are positioned at the start of an inverted list,	 * and we should call {@link #writeFrequency(int)}.*/	protected static final int BEFORE_FREQUENCY = 1;	/** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */	protected static final int BEFORE_DOCUMENT_RECORD = 2;	/** This value of {@link #state} means that we just started a new document record, and we	 * should call {@link #writeDocumentPointer(OutputBitStream, int)}. */	protected static final int BEFORE_POINTER = 3;	/** This value of {@link #state} can be assumed only in indices that contain payloads; it	 * means that we are positioned just before the payload for the current document record. */	protected static final int BEFORE_PAYLOAD = 4;	/** This value of {@link #state} can be assumed only in indices that contain counts; it	 * means that we are positioned just before the count for the current document record. */	protected static final int BEFORE_COUNT = 5;	/** This value of {@link #state} can be assumed only in indices that contain document positions; 	 * it means that we are positioned just before the position list of the current document record. */	protected static final int BEFORE_POSITIONS = 6;	/** This is the first unused state. Subclasses may start from this value to define new states. */	protected static final int FIRST_UNUSED_STATE = 7;	/** The underlying {@link OutputBitStream}. */	protected OutputBitStream obs;	/** The offset {@link OutputBitStream}. */	private OutputBitStream offset;	/** The current state of the writer. */	protected int state;	/** The number of document records that the current inverted list will contain. */	protected int frequency;	/** The number of document records already written for the current inverted list. */	protected int writtenDocuments;	/** The current document pointer. */	protected int currentDocument;	/** The last document pointer in the current list. */	protected int lastDocument;	/** The position (in bytes) where the last inverted list started. */	private long lastInvertedListPos;	/** The parameter <code>b</code> for Golomb coding of pointers. */	protected int b;	/** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */	protected int log2b;	/** The maximum number of positions in a document record so far. */	public int maxCount;	/** A temporary array for skewed Golomb coding. */	private int[] sortedDeltas = IntArrays.EMPTY_ARRAY;	/** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <samp>.index</samp>).	 *  If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <samp>.offsets</samp>). 	 *  When {@link #close()} will be called, the property file will also be produced (stemmed with <samp>.properties</samp>),	 *  or enriched if it already exists.	 * 	 * @param basename the basename.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param writeOffsets if <code>true</code>, the offset file will also be produced.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 */	public BitStreamIndexWriter( final CharSequence basename, final int numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags ) throws IOException {		this( 			new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.INDEX_EXTENSION ) ),			writeOffsets? new OutputBitStream( new FileOutputStream( basename + DiskBasedIndex.OFFSETS_EXTENSION) ) : null,			numberOfDocuments,			flags		 );	}	/** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.	 *	 * @param obs the underlying output bit stream.	 * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 */	public BitStreamIndexWriter( final OutputBitStream obs, final OutputBitStream offset, final int numberOfDocuments, final Map<Component,Coding> flags ) {		super( numberOfDocuments, flags );		this.obs = obs;		this.offset = offset;		this.frequency = -1;		this.currentTerm = -1;		this.maxCount = 0;		if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" );	}	/** Creates a new index writer, with the specified underlying {@link OutputBitStream},	 *  without an associated offset bit stream.	 *	 * @param obs the underlying output bit stream.	 * @param numberOfDocuments the number of documents in the collection to be indexed.	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).	 */	public BitStreamIndexWriter( final OutputBitStream obs, final int numberOfDocuments, final Map<Component,Coding> flags ) {		this( obs, null, numberOfDocuments, flags );	}	public long newInvertedList() throws IOException {		if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );		if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state );		// The position (in bits) where the new inverted list starts		long pos = obs.writtenBits();		// Reset variables		writtenDocuments = 0;		currentTerm++;		currentDocument = -1;		// If needed, write the offset		if ( offset != null ) offset.writeLongGamma( pos - lastInvertedListPos );		lastInvertedListPos = pos;		state = BEFORE_FREQUENCY;		return pos;	}	public int writeFrequency( final int frequency ) throws IOException {		if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state );		int bitCount;		// Write the frequency		switch( frequencyCoding ) {		case SHIFTED_GAMMA:			bitCount = obs.writeShiftedGamma( frequency - 1 ); // frequency cannot be 0			break;		case GAMMA:			bitCount = obs.writeGamma( frequency - 1 ); // frequency cannot be 0			break;		case DELTA:			bitCount = obs.writeDelta( frequency - 1 ); // frequency cannot be 0			break;		default:			throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." );		}		this.frequency = frequency;		// We compute the modulus used for pointer Golomb coding 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -