⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bitstreamhpindexwriter.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
		obs.close();		positions.close();		cacheDataIn.close();		cacheDataOut.close();		tempFile.delete();	}		/** Computes the towers.	 * 	 * @param quantumBitLength the length in bits of a quantum.	 * @param positionsQuantumBitLength the length in bits of a quantum in the positions stream.	 * @param entryBitLength the estimated length in bits of a tower entry.	 * @param toTheEnd the number of bits that must be skipped to reach the next tower (usually,	 * the length of the first pointer of the next block or 0 if this is to be the last block).	 * @param skip an array of output bit stream where the data related to each tower will be written.	 * @param towerData will be filled with statistical date about the towers.	 * @param doinIt if true, we are actually writing a tower, not just trying.	 */	private void tryTower( final int quantumBitLength, final int positionsQuantumBitLength, final int entryBitLength, long toTheEnd, final OutputBitStream[] skip, final TowerData towerData, final boolean doinIt ) throws IOException {		int i, k, s;		long d;		int basePointer;		// truncated is true only for those towers (in defective blocks) whose height is strictly smaller than the height they should have			boolean truncated = false;		if ( DEBUG && doinIt ) System.err.println( "Writing out tower for term " + currentTerm + "; quantumBitLength=" + quantumBitLength + " entryBitLength=" + entryBitLength );				for ( k = ( cache - 1 ) / q; k >= 0; k-- ) {			// Where are we? At the end of the k-th quantum. So toTheEnd must be increased by			// the length of the data contained in the same quantum, moving us...			toTheEnd += cacheDataLength[ k ];			// ...just after the k-th skip tower.			// We compute the maximum valid index of the skip tower (*MUST* be kept in sync with the subsequent loop).			s = ( k == 0 ) ? h : Fast.leastSignificantBit( k );			// This test handles defective blocks. In particular, for defective quanta s=-1,			// yielding no skipping data at all for such quanta. truncated is true if the			// current tower is truncated w.r.t. the infinite skip list.			if ( cache < w ) {				final int upperBound = Fast.mostSignificantBit( ( cache / q ) - k );				if ( s > upperBound ) {					s = upperBound;					truncated = true;				} else truncated = false;			}			else truncated = k == 0;						skip[ k ].writtenBits( 0 );			if ( s >= 0 ) {				if ( DEBUG && doinIt ) System.err.print( "% (" + k + ") [" + skipPointer[ k ] + "] " );				basePointer = skipPointer[ k ];				/* If the current tower is truncated, we must actually write the top of the tower.				 * The top must be forecast in a Bernoullian way: we write it as a difference from the average pointer skip, 				 * which is q 2^s / relativeFrequency. */				if ( truncated ) {					towerData.numberOfTopEntries++;					// TODO: prediction should be based not on 1<<s, but rather on the actual number of skipped quanta, which could be smaller (because of end-of-list)					towerData.bitsForTopSkipPointers += skip[k].writeGolomb( Fast.int2nat( skipPointer[ k + ( 1 << s ) ] - basePointer - pointerPrediction[ s ] ), towerTopB[ s ], towerTopLog2B[ s ] );					towerData.bitsForTopBitSkips += skip[k].writeLongDelta( Fast.int2nat( 						(int) ( ( toTheEnd - distance[k + ( 1 << s )] ) -								( quantumBitLength * ( 1 << s ) + entryBitLength * ( ( 1 << s + 1 ) - s - 2 ) ) ) )					);					towerData.bitsForTopPositionsBitSkips += skip[k].writeLongDelta( 							Fast.int2nat( ( cachePositionsLength[ k ] - cachePositionsLength[ k + ( 1 << s ) ] ) - positionsQuantumBitLength * ( 1 << s ) ) );				}								if ( DEBUG && doinIt ) System.err.print( ( truncated ? "" : "(" ) + ( skipPointer[ k + ( 1 << s ) ] - basePointer ) + ":" + ( toTheEnd - distance[k + ( 1 << s )] ) + ( truncated ? " " : ") " ) );				// Produce a (single) tower of height s				for ( i = s - 1; i >= 0; i-- ) {					towerData.bitsForLowerSkipPointers += skip[k].writeGolomb( 						Fast.int2nat( ( skipPointer[k + ( 1 << i )] - basePointer ) - ( ( skipPointer[k + ( 1 << i + 1 )] - basePointer ) / 2 ) ),						towerLowerB[ i ], towerLowerLog2B[ i ] 					);					towerData.bitsForLowerBitSkips += skip[k].writeLongDelta( 						Fast.int2nat( (int) ( ( ( toTheEnd - distance[k + ( 1 << ( i + 1 ) )] - entryBitLength * ( i + 1 ) ) / 2 ) -								( toTheEnd - distance[k + ( 1 << i )] ) ) ) 					);					towerData.bitsForLowerPositionsBitSkips += skip[k].writeLongDelta( 							Fast.int2nat( ( cachePositionsLength[ k ] - cachePositionsLength[k + ( 1 << i + 1 )] ) / 2 -									( cachePositionsLength[ k ] - cachePositionsLength[k + ( 1 << i )] ) ) 						);					if ( DEBUG && doinIt ) System.err.print( ( skipPointer[k + ( 1 << i )] - basePointer ) + ":" + ( toTheEnd - distance[k + ( 1 << i )] ) + " " );				}				if ( s > 0 ) { // No length for single-entry towers.					d = bitCount.writeDelta( Fast.int2nat( (int) skip[k].writtenBits() - ( s + 1 ) * entryBitLength ) );					towerData.bitsForTowerLengths += d;					toTheEnd += d;				}				toTheEnd += skip[k].writtenBits();				if ( DEBUG && doinIt ) System.err.print( " (" + (int) skip[k].writtenBits() + " bits)" );				towerData.numberOfLowerEntries += s;				towerData.numberOfSkipTowers++;				if ( DEBUG && doinIt ) System.err.println();			}			distance[ k ] = toTheEnd;			// Where are we? Just before the beginning of the k-th skip tower			toTheEnd += cachePointer[ k ].writtenBits();			// Where are we? Just before the beginning of the k-th document record		}	}	/** Write out the cache content.	 * 	 * @param nextPointer the first pointer of the next block, or -1 if this is the last block.	 */	private void writeOutCache( final int nextPointer ) throws IOException {		if ( DEBUG ) System.err.println( "Entered writeOutCache() with cache=" + cache + " (H is " + ( 1 << h ) + ", B is " + w + ")" );		cacheDataLength[ ( cache + q - 1 ) / q - 1 ] = (int)cacheDataOut.writtenBits();		if ( ASSERTS ) assert positions.writtenBits() - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : ( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ) + " > " + Integer.MAX_VALUE;		cachePositionsLength[ ( cache + q - 1 ) / q - 1 ] = (int)( positions.writtenBits() - writtenPositionsBitsAtLastQuantum );		cachePositionsLength[ ( cache + q - 1 ) / q ] = 0;		writtenPositionsBitsAtLastQuantum = positions.writtenBits();				/* We cumulate the position lengths so to obtain the actual skips. */		for( int i = ( cache + q - 1 ) / q; i-- != 0; ) cachePositionsLength[ i ] += cachePositionsLength[ i + 1 ];				//System.err.print( basename ); for( int i = 0; i < ( cache + q - 1 ) / q; i++ ) System.err.print( " " + cachePositionsLength[ i ] ); System.err.println();				/* Number of bits to go after the first pointer of the first record of the next block (or, if there		   is no other block in the current list, to go to the end of the list). */		long toTheEnd;		// Record the new document pointer for the highest tower		int nextAfter = ( ( cache + q ) - 1 ) / q; // This is ceil( cache / q )		if ( nextPointer >= 0 ) {			skipPointer[nextAfter] = nextPointer;			toTheEnd = writeOutPointer( bitCount, nextPointer );		} else {			skipPointer[nextAfter] = currentDocument + 1; // Fake: just for the last block			toTheEnd = 0;		}		distance[nextAfter] = 0;		int k, s;		long d;		// Compute quantum length in bits (without towers)		int quantumBitLength = 0, entryBitLength = 0, positionsQuantumBitLength = (int)( ( cachePositionsLength[ 0 ] * q + ( cache -1 ) ) / cache );		for ( d = k = 0; k <= ( ( cache - 1 ) / q ); k++ ) d += ( cachePointer[k].writtenBits() + cacheDataLength[ k ] );		quantumBitLength = (int)( ( ( d * q ) + ( cache - 1 ) ) / cache );		final TowerData td = new TowerData();		final Int2IntRBTreeMap candidates = new Int2IntRBTreeMap(); 		/* As a first try, we compute the tower costs using 0 as average entry bit length. */		tryTower( quantumBitLength, positionsQuantumBitLength, 0, toTheEnd, cacheSkipBitCount, td, false );				if ( td.numberOfSkipTowers > 0 ) { // There actually is at least a tower.			/* Now we repeat this operation, trying to obtain the best value for the			 * average entry bit length. 			 */			while( candidates.size() < MAX_TRY && ! candidates.containsValue( entryBitLength = (int)( td.bitsForTowers() / td.numberOfEntries() ) ) ) {				td.clear();				tryTower( quantumBitLength, positionsQuantumBitLength, entryBitLength, toTheEnd, cacheSkipBitCount, td, false );				candidates.put( (int)( td.bitsForTowers() / td.numberOfEntries() ), entryBitLength );			}			if ( ASSERTS ) assert candidates.size() < MAX_TRY;			entryBitLength = candidates.get( candidates.firstIntKey() );			if ( DEBUG ) System.err.println( "Going to write tower at position " + obs.writtenBits() );			tryTower( quantumBitLength, positionsQuantumBitLength, entryBitLength, toTheEnd, cacheSkip, towerData, true );		}		// Ready to write out cache		int maxCacheDataLength = 0;		for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) if ( cacheDataLength[ k ] > maxCacheDataLength ) maxCacheDataLength = cacheDataLength[ k ];  				/* We have two ways of writing out cached data. If all the data is still in the output bit		 * stream buffer, we just read it directly. Otherwise, we have to pour it into a temporary buffer. */				final byte[] buffer;		final boolean direct;		int pos = 0;				cacheDataOut.align();		if ( cacheDataOut.buffer() != null ) {			buffer = cacheDataOut.buffer();			direct = true;		}		else {			cacheDataOut.flush();			buffer = new byte[ ( maxCacheDataLength + 7 ) / 8 ];			direct = false;			cacheDataIn.flush();			cacheDataIn.position( 0 );		}				for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) {			/* See comments above. */			s = ( k == 0 ) ? h : Fast.leastSignificantBit( k );			if ( cache < w ) s = Math.min( s, Fast.mostSignificantBit( ( cache / q ) - k ) );			d = cachePointer[k].writtenBits();			cachePointer[k].flush();			obs.write( cachePointerByte[k].array, d );			d = cacheSkip[k].writtenBits();			cacheSkip[k].flush();			if ( s >= 0 ) {				if ( k == 0 ) {					if ( prevQuantumBitLength < 0 ) {						bitsForQuantumBitLengths += obs.writeLongDelta( quantumBitLength );						bitsForPositionsQuantumBitLengths += obs.writeLongDelta( positionsQuantumBitLength );						bitsForEntryBitLengths += obs.writeLongDelta( entryBitLength );					}					else {						bitsForQuantumBitLengths += obs.writeLongDelta( Fast.int2nat( quantumBitLength - prevQuantumBitLength ) );						bitsForPositionsQuantumBitLengths += obs.writeLongDelta( Fast.int2nat( positionsQuantumBitLength - prevPositionsQuantumBitLength ) );						bitsForEntryBitLengths += obs.writeLongDelta( Fast.int2nat( entryBitLength - prevEntryBitLength ) );					}					prevQuantumBitLength = quantumBitLength;					prevPositionsQuantumBitLength = positionsQuantumBitLength;					prevEntryBitLength = entryBitLength;					numberOfBlocks++;				}				if ( s > 0 ) obs.writeDelta( Fast.int2nat( (int)d - entryBitLength * ( s + 1 ) ) ); // No length for single-entry towers.			} else if ( ASSERTS ) assert d == 0;			obs.write( cacheSkipByte[k].array, d );						if ( direct ) {				obs.write( buffer, pos * 8, cacheDataLength[ k ] );				pos += ( cacheDataLength[ k ] + 7 ) / 8;			}			else {				cacheDataIn.read( buffer, 0, ( cacheDataLength[ k ] + 7 ) / 8 );				obs.write( buffer, cacheDataLength[ k ] );			}		}		// Clean used caches		for ( k = 0; k <= ( ( cache - 1 ) / q ); k++ ) {			cachePointerByte[k].reset();			cachePointer[k].writtenBits( 0 );			cacheSkipByte[k].reset();			cacheSkip[k].writtenBits( 0 );			cacheDataOut.position( 0 );			cacheDataOut.writtenBits( 0 );		}		cache = 0;		if ( ASSERTS ) assert obs.writtenBits() + positions.writtenBits() == writtenBits();	}	public long writtenBits() {		return bitsForFrequencies + bitsForPointers + bitsForPayloads + bitsForCounts + bitsForPositions + bitsForPositionsOffsets + 		towerData.bitsForTowers() + bitsForQuantumBitLengths + bitsForPositionsQuantumBitLengths + bitsForEntryBitLengths;	}	public Properties properties() {		Properties result = new Properties();		result.setProperty( Index.PropertyKeys.DOCUMENTS, numberOfDocuments );		result.setProperty( Index.PropertyKeys.TERMS, currentTerm + 1 );		result.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings );		result.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );		result.setProperty( Index.PropertyKeys.INDEXCLASS, FileHPIndex.class.getName() );		result.setProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM, q );		result.setProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT, h );		if ( COOKIES ) result.setProperty( "cookies", true );		// We save all flags, except for the PAYLOAD component, which is just used internally.		for( Map.Entry<Component,Coding> e: flags.entrySet() )			if ( e.getKey() != Component.PAYLOADS ) result.addProperty( Index.PropertyKeys.CODING, new MutableString().append( e.getKey() ).append( ':' ).append( e.getValue() ) );		return result;	}	public void printStats( final PrintStream stats ) {		super.printStats( stats );		stats.println( "Skip towers: " + Util.format( towerData.numberOfSkipTowers ) + " (" + 				Util.format( towerData.bitsForTowers() ) + " bits [" + 				Util.format( towerData.bitsForTowers()*100.0/writtenBits() ) + "%], " + 				Util.format( towerData.bitsForTowers()/ (double)towerData.numberOfSkipTowers ) + " bits/tower)" );		stats.println( "Skip entries: " + Util.format( towerData.numberOfEntries() ) + " (" + 				Util.format( towerData.bitsForEntries() / (double)towerData.numberOfEntries()) + " bits/entry)" );		// Note that lengths are written approximately every other tower.		stats.println( "Skip tower lengths: " + Util.format( towerData.bitsForTowerLengths ) + " bits (" + Util.format( 2.0 * towerData.bitsForTowerLengths/ towerData.numberOfSkipTowers ) + " bits/tower)" );		stats.println( "Quantum bit lengths: " + Util.format( bitsForQuantumBitLengths ) + " bits (" + Util.format( bitsForQuantumBitLengths/ (double)numberOfBlocks ) + " bits/block)" );		stats.println( "Positions quantum bit lengths: " + Util.format( bitsForPositionsQuantumBitLengths ) + " bits (" + Util.format( bitsForPositionsQuantumBitLengths / (double)numberOfBlocks ) + " bits/block)" );		stats.println( "Entry bit lengths: " + Util.format(bitsForEntryBitLengths ) + " bits (" + Util.format( bitsForEntryBitLengths/ (double)numberOfBlocks ) + " bits/block)" );				stats.println( "Top bit skips: " + Util.format(towerData.bitsForTopBitSkips ) + " bits (" + Util.format( towerData.bitsForTopBitSkips / (double)towerData.numberOfTopEntries ) + " bits/skip)" );		stats.println( "Top positions bit skips: " + Util.format(towerData.bitsForTopPositionsBitSkips ) + " bits (" + Util.format( towerData.bitsForTopPositionsBitSkips / (double)towerData.numberOfTopEntries ) + " bits/skip)" );		stats.println( "Top pointer skips: " + Util.format( towerData.bitsForTopSkipPointers ) + " bits (" + Util.format( towerData.bitsForTopSkipPointers/ (double)towerData.numberOfTopEntries ) + " bits/skip)" );		stats.println( "Lower bit skips: " + Util.format( towerData.bitsForLowerBitSkips ) + " bits (" + Util.format( towerData.bitsForLowerBitSkips/ (double)towerData.numberOfLowerEntries ) + " bits/skip)" );		stats.println( "Lower positions bit skips: " + Util.format( towerData.bitsForLowerPositionsBitSkips ) + " bits (" + Util.format( towerData.bitsForLowerPositionsBitSkips/ (double)towerData.numberOfLowerEntries ) + " bits/skip)" );		stats.println( "Lower pointer skips: " + Util.format( towerData.bitsForLowerSkipPointers ) + " bits (" + Util.format( towerData.bitsForLowerSkipPointers/ (double)towerData.numberOfLowerEntries ) + " bits/skip)" );		stats.println( "Bit skips: " + Util.format( towerData.bitsForBitSkips() ) + " bits (" + Util.format( towerData.bitsForBitSkips()/ (double)towerData.numberOfEntries() ) + " bits/skip)" );		stats.println( "Positions bit skips: " + Util.format( towerData.bitsForPositionsBitSkips() ) + " bits (" + Util.format( towerData.bitsForPositionsBitSkips()/ (double)towerData.numberOfEntries() ) + " bits/skip)" );		stats.println( "Pointer skips: " + Util.format( towerData.bitsForSkipPointers() ) + " bits (" + Util.format( towerData.bitsForSkipPointers()/ (double)towerData.numberOfEntries() ) + " bits/skip)" );	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -