⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bitstreamhpindexwriter.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
		private int writeOutPointer( final OutputBitStream out, final int pointer ) throws IOException {		if ( frequency == numberOfDocuments ) return 0; // We do not write pointers for everywhere occurring terms.		switch ( pointerCoding ) {			case GAMMA:				return out.writeGamma( pointer - lastDocument - 1 );			case DELTA:				return out.writeDelta( pointer - lastDocument - 1 );			case GOLOMB:				return out.writeGolomb( pointer - lastDocument - 1, b, log2b );			default:				throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );		}	}	/** A structure maintaining statistical data about tower construction. */	public static class TowerData {		/** The number of bits written for bit skips at the top of a tower. */		public long bitsForTopBitSkips;		/** The number of bits written for positions bit skips at the top of a tower. */		public long bitsForTopPositionsBitSkips;		/** The number of bits written for skip pointers at the top of a tower. */		public long bitsForTopSkipPointers;		/** The number of bits written for bit skips in the lower part of a tower. */		public long bitsForLowerBitSkips;		/** The number of bits written for positions bit skips in the lower part of a tower. */		public long bitsForLowerPositionsBitSkips;		/** The number of bits written for skip pointers in the lower part of a tower. */		public long bitsForLowerSkipPointers;		/** The number of bits written for tower lengths. */		public long bitsForTowerLengths;		/** The number of written skip towers. */		public long numberOfSkipTowers;		/** The number of written top skip entries. */		public long numberOfTopEntries;		/** The number of written lower skip entries. */		public long numberOfLowerEntries;		/** Clear all fields of this tower data. */		void clear() {			bitsForTopBitSkips = 0;			bitsForTopPositionsBitSkips = 0;			bitsForTopSkipPointers = 0;			bitsForLowerBitSkips = 0;			bitsForLowerPositionsBitSkips = 0;			bitsForLowerSkipPointers = 0;			bitsForTowerLengths = 0;			numberOfSkipTowers = 0;			numberOfTopEntries = 0;			numberOfLowerEntries = 0;		}		/** Returns the overall number of bits used for skip pointers.		 * @return the overall number of bits used for skip pointers.		 */		public long bitsForSkipPointers() { return bitsForTopSkipPointers + bitsForLowerSkipPointers; }		/** Returns the overall number of bits used for bit skips. 		 * @return the overall number of bits used for bit skips.		 */		public long bitsForBitSkips() { return bitsForTopBitSkips + bitsForLowerBitSkips; }		/** Returns the overall number of bits used for bit skips. 		 * @return the overall number of bits used for bit skips.		 */		public long bitsForPositionsBitSkips() { return bitsForTopPositionsBitSkips + bitsForLowerPositionsBitSkips; }		/** Returns the overall number of bits used for tower entries (bits for tower lengths are not included).		 * @return the overall number of bits used for tower entries.		 */		public long bitsForEntries() { return bitsForSkipPointers() + bitsForBitSkips() + bitsForPositionsBitSkips(); }		/** Returns the overall number of bits used for towers.		 * @return the overall number of bits used for towers.		 */		public long bitsForTowers() { return bitsForTowerLengths + bitsForEntries(); }		/** Returns the overall number of entries.		 * @return the overall number of entries.		 */		public long numberOfEntries() { return numberOfTopEntries + numberOfLowerEntries; }	}			public long newInvertedList() throws IOException {		if ( cache != 0 ) writeOutCache( -1 );		if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );		if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state );		// The position (in bits) where the new inverted list starts		long pos = obs.writtenBits();		// Reset variables		writtenDocuments = 0;		currentTerm++;		currentDocument = -1;		// If needed, write the offset		if ( offset != null ) offset.writeLongGamma( pos - lastInvertedListPos );		// Write the offset for positions		bitsForPositionsOffsets += obs.writeLongDelta( positions.writtenBits() );		lastInvertedListPos = pos;		state = BEFORE_FREQUENCY;		return pos;	}	public int writeFrequency( final int frequency ) throws IOException {		if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state );		int bitCount;		// Write the frequency		switch( frequencyCoding ) {		case SHIFTED_GAMMA:			bitCount = obs.writeShiftedGamma( frequency - 1 ); // frequency cannot be 0			break;		case GAMMA:			bitCount = obs.writeGamma( frequency - 1 ); // frequency cannot be 0			break;		case DELTA:			bitCount = obs.writeDelta( frequency - 1 ); // frequency cannot be 0			break;		default:			throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." );		}		this.frequency = frequency;		// We compute the modulus used for pointer Golomb coding 		if ( pointerCoding == Coding.GOLOMB ) {			b = BitStreamIndex.golombModulus( frequency, numberOfDocuments ); 			log2b = Fast.mostSignificantBit( b );		}		prevQuantumBitLength = prevEntryBitLength = prevPositionsQuantumBitLength = -1;			if ( DEBUG ) System.err.println( "----------- " + currentTerm + " (" + frequency + ")" );		final long pointerQuantumSigma = BitStreamIndex.quantumSigma( frequency, numberOfDocuments, q );		for( int i = Math.min( h, Fast.mostSignificantBit( frequency / q ) ); i >= 0; i-- ) {			towerTopB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i + 1 );			towerTopLog2B[ i ] = Fast.mostSignificantBit( towerTopB[ i ] );			towerLowerB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i );			towerLowerLog2B[ i ] = Fast.mostSignificantBit( towerLowerB[ i ] );			pointerPrediction[ i ] = (int)( ( q * ( 1L << i ) * numberOfDocuments + frequency / 2 ) / frequency );		}				state = BEFORE_DOCUMENT_RECORD;		bitsForFrequencies += bitCount;		return bitCount;	}	public OutputBitStream newDocumentRecord() throws IOException {		if ( frequency == writtenDocuments ) throw new IllegalStateException( "Document record overflow (written " + this.frequency + " already)" );		if ( state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new document record in state " + state );		writtenDocuments++;		numberOfPostings++;		lastDocument = currentDocument;		state = BEFORE_POINTER;		return cacheDataOut;	}	public int writeDocumentPointer( @SuppressWarnings("unused") final OutputBitStream unused, final int pointer ) throws IOException {		if ( state != BEFORE_POINTER ) throw new IllegalStateException( "Trying to write pointer in state " + state );		// If the previous block is over, write it out!		if ( cache == w ) writeOutCache( pointer );		final OutputBitStream out;				// Record data pointer if we are on a skip; otherwise, write it to the cache.		if ( cache % q == 0 ) {			if ( cache / q > 0 ) {				cacheDataLength[ cache / q - 1 ] = (int)cacheDataOut.writtenBits();				if ( ASSERTS ) assert positions.writtenBits() - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : ( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ) + " > " + Integer.MAX_VALUE;				cachePositionsLength[ cache / q -1 ] = (int)( positions.writtenBits() - writtenPositionsBitsAtLastQuantum );				writtenPositionsBitsAtLastQuantum = positions.writtenBits();			}			cacheDataOut.align();			cacheDataOut.writtenBits( 0 );			skipPointer[ cache / q ] = pointer;			out = cachePointer[ cache++ / q ];		} 		else {			cache++;			out = cacheDataOut;		}		currentDocument = pointer;		int bitCount = 0;		if ( frequency != numberOfDocuments ) { // We do not write pointers for everywhere occurring documents.			switch( pointerCoding ) {				case SHIFTED_GAMMA:					bitCount = out.writeShiftedGamma( pointer - lastDocument - 1 );					break;				case GAMMA:					bitCount = out.writeGamma( pointer - lastDocument - 1 );					break;				case DELTA:					bitCount = out.writeDelta( pointer - lastDocument - 1 );					break;				case GOLOMB:					bitCount = out.writeGolomb( pointer - lastDocument - 1, b, log2b );					break;				default:					throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );			}		}		else if ( pointer - lastDocument != 1 ) throw new IllegalStateException( "Term " + currentTerm + " has frequency equal to the number of documents, but pointers are not consecutive integers" );		state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;		bitsForPointers += bitCount;		return bitCount;	}	public int writePayload( final OutputBitStream out, final Payload payload ) throws IOException {		throw new IllegalStateException( "High-performance indices do not support payloads" );		/*if ( frequency < 0 ) throw new IllegalStateException( "Trying to write payload without calling newInvertedList" );		if ( state != BEFORE_PAYLOAD ) throw new IllegalStateException( "Trying to write payload in state " + state );		final int count = payload.write( out );		bitsForPayloads += count;		state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;		return count;*/	}		public int writePositionCount( final OutputBitStream out, final int count ) throws IOException {		if ( frequency < 0 ) throw new IllegalStateException( "Trying to write count without calling newInvertedList" );		if ( state != BEFORE_COUNT ) throw new IllegalStateException( "Trying to write count in state " + state );		final int bitCount;		numberOfOccurrences += count;		switch( countCoding ) {			case SHIFTED_GAMMA:				bitCount = out.writeShiftedGamma( count - 1 );				break;			case GAMMA:				bitCount = out.writeGamma( count - 1 );				break;			case UNARY:				bitCount = out.writeUnary( count - 1 );				break;			case DELTA:				bitCount = out.writeDelta( count - 1 );				break;			default:				throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );		}				state = hasPositions ? BEFORE_POSITIONS : BEFORE_DOCUMENT_RECORD;		bitsForCounts += bitCount;		return bitCount;	}	public int writeDocumentPositions( @SuppressWarnings("unused") final OutputBitStream unused, final int[] occ, final int offset, final int len, final int docSize ) throws IOException {		if ( frequency < 0 ) throw new IllegalStateException( "Trying to write occurrences without calling newInvertedList" );		if ( state != BEFORE_POSITIONS ) throw new IllegalStateException( "Trying to write positions in state " + state );		if ( ASSERTS && docSize > 0 ) for( int i = 0; i< len; i++ ) assert occ[ offset + i ] < docSize : "Position " + occ[ offset + i ] + " for document " + currentDocument + " is too large; size is " + docSize;				int i;		int prev = -1;		int bitCount = 0;		final int end = offset + len;		final OutputBitStream positions = this.positions;				switch( positionCoding ) {			case GAMMA:				if ( COOKIES ) bitCount += positions.writeGamma( Integer.MAX_VALUE );				for( i = offset; i < end; i++ ) {					bitCount += positions.writeGamma( occ[ i ] - prev - 1 );					prev = occ[ i ];				}				break;			case DELTA:				if ( COOKIES )  bitCount += positions.writeDelta( Integer.MAX_VALUE );				for( i = offset; i < end; i++ ) {					bitCount += positions.writeDelta( occ[ i ] - prev - 1 );					prev = occ[ i ];				}				break;			case SHIFTED_GAMMA:				if ( COOKIES ) bitCount += positions.writeShiftedGamma( Integer.MAX_VALUE );				for( i = offset; i < end; i++ ) {					bitCount += positions.writeShiftedGamma( occ[ i ] - prev - 1 );					prev = occ[ i ];				}				break;			default:				throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );		}		state = BEFORE_DOCUMENT_RECORD;		bitsForPositions += bitCount;		if ( len > maxCount ) maxCount = len;		return bitCount;		}		public void close() throws IOException {		if ( cache != 0 ) writeOutCache( -1 );				if ( state != BEFORE_DOCUMENT_RECORD && state != BEFORE_INVERTED_LIST ) throw new IllegalStateException( "Trying to close index in state " + state );		if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );		if ( writtenBits() != obs.writtenBits() + positions.writtenBits() ) 			throw new IllegalStateException( "Written bits count mismatch: we say " + writtenBits() + ", the streams say " + ( obs.writtenBits() + positions.writtenBits() ) );		if ( offset != null ) {			offset.writeLongGamma( obs.writtenBits() - lastInvertedListPos );			offset.close();		}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -