📄 bitstreamhpindexwriter.java
字号:
private int writeOutPointer( final OutputBitStream out, final int pointer ) throws IOException { if ( frequency == numberOfDocuments ) return 0; // We do not write pointers for everywhere occurring terms. switch ( pointerCoding ) { case GAMMA: return out.writeGamma( pointer - lastDocument - 1 ); case DELTA: return out.writeDelta( pointer - lastDocument - 1 ); case GOLOMB: return out.writeGolomb( pointer - lastDocument - 1, b, log2b ); default: throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." ); } } /** A structure maintaining statistical data about tower construction. */ public static class TowerData { /** The number of bits written for bit skips at the top of a tower. */ public long bitsForTopBitSkips; /** The number of bits written for positions bit skips at the top of a tower. */ public long bitsForTopPositionsBitSkips; /** The number of bits written for skip pointers at the top of a tower. */ public long bitsForTopSkipPointers; /** The number of bits written for bit skips in the lower part of a tower. */ public long bitsForLowerBitSkips; /** The number of bits written for positions bit skips in the lower part of a tower. */ public long bitsForLowerPositionsBitSkips; /** The number of bits written for skip pointers in the lower part of a tower. */ public long bitsForLowerSkipPointers; /** The number of bits written for tower lengths. */ public long bitsForTowerLengths; /** The number of written skip towers. */ public long numberOfSkipTowers; /** The number of written top skip entries. */ public long numberOfTopEntries; /** The number of written lower skip entries. */ public long numberOfLowerEntries; /** Clear all fields of this tower data. */ void clear() { bitsForTopBitSkips = 0; bitsForTopPositionsBitSkips = 0; bitsForTopSkipPointers = 0; bitsForLowerBitSkips = 0; bitsForLowerPositionsBitSkips = 0; bitsForLowerSkipPointers = 0; bitsForTowerLengths = 0; numberOfSkipTowers = 0; numberOfTopEntries = 0; numberOfLowerEntries = 0; } /** Returns the overall number of bits used for skip pointers. * @return the overall number of bits used for skip pointers. */ public long bitsForSkipPointers() { return bitsForTopSkipPointers + bitsForLowerSkipPointers; } /** Returns the overall number of bits used for bit skips. * @return the overall number of bits used for bit skips. */ public long bitsForBitSkips() { return bitsForTopBitSkips + bitsForLowerBitSkips; } /** Returns the overall number of bits used for bit skips. * @return the overall number of bits used for bit skips. */ public long bitsForPositionsBitSkips() { return bitsForTopPositionsBitSkips + bitsForLowerPositionsBitSkips; } /** Returns the overall number of bits used for tower entries (bits for tower lengths are not included). * @return the overall number of bits used for tower entries. */ public long bitsForEntries() { return bitsForSkipPointers() + bitsForBitSkips() + bitsForPositionsBitSkips(); } /** Returns the overall number of bits used for towers. * @return the overall number of bits used for towers. */ public long bitsForTowers() { return bitsForTowerLengths + bitsForEntries(); } /** Returns the overall number of entries. * @return the overall number of entries. */ public long numberOfEntries() { return numberOfTopEntries + numberOfLowerEntries; } } public long newInvertedList() throws IOException { if ( cache != 0 ) writeOutCache( -1 ); if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" ); if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state ); // The position (in bits) where the new inverted list starts long pos = obs.writtenBits(); // Reset variables writtenDocuments = 0; currentTerm++; currentDocument = -1; // If needed, write the offset if ( offset != null ) offset.writeLongGamma( pos - lastInvertedListPos ); // Write the offset for positions bitsForPositionsOffsets += obs.writeLongDelta( positions.writtenBits() ); lastInvertedListPos = pos; state = BEFORE_FREQUENCY; return pos; } public int writeFrequency( final int frequency ) throws IOException { if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state ); int bitCount; // Write the frequency switch( frequencyCoding ) { case SHIFTED_GAMMA: bitCount = obs.writeShiftedGamma( frequency - 1 ); // frequency cannot be 0 break; case GAMMA: bitCount = obs.writeGamma( frequency - 1 ); // frequency cannot be 0 break; case DELTA: bitCount = obs.writeDelta( frequency - 1 ); // frequency cannot be 0 break; default: throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." ); } this.frequency = frequency; // We compute the modulus used for pointer Golomb coding if ( pointerCoding == Coding.GOLOMB ) { b = BitStreamIndex.golombModulus( frequency, numberOfDocuments ); log2b = Fast.mostSignificantBit( b ); } prevQuantumBitLength = prevEntryBitLength = prevPositionsQuantumBitLength = -1; if ( DEBUG ) System.err.println( "----------- " + currentTerm + " (" + frequency + ")" ); final long pointerQuantumSigma = BitStreamIndex.quantumSigma( frequency, numberOfDocuments, q ); for( int i = Math.min( h, Fast.mostSignificantBit( frequency / q ) ); i >= 0; i-- ) { towerTopB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i + 1 ); towerTopLog2B[ i ] = Fast.mostSignificantBit( towerTopB[ i ] ); towerLowerB[ i ] = BitStreamIndex.gaussianGolombModulus( pointerQuantumSigma, i ); towerLowerLog2B[ i ] = Fast.mostSignificantBit( towerLowerB[ i ] ); pointerPrediction[ i ] = (int)( ( q * ( 1L << i ) * numberOfDocuments + frequency / 2 ) / frequency ); } state = BEFORE_DOCUMENT_RECORD; bitsForFrequencies += bitCount; return bitCount; } public OutputBitStream newDocumentRecord() throws IOException { if ( frequency == writtenDocuments ) throw new IllegalStateException( "Document record overflow (written " + this.frequency + " already)" ); if ( state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new document record in state " + state ); writtenDocuments++; numberOfPostings++; lastDocument = currentDocument; state = BEFORE_POINTER; return cacheDataOut; } public int writeDocumentPointer( @SuppressWarnings("unused") final OutputBitStream unused, final int pointer ) throws IOException { if ( state != BEFORE_POINTER ) throw new IllegalStateException( "Trying to write pointer in state " + state ); // If the previous block is over, write it out! if ( cache == w ) writeOutCache( pointer ); final OutputBitStream out; // Record data pointer if we are on a skip; otherwise, write it to the cache. if ( cache % q == 0 ) { if ( cache / q > 0 ) { cacheDataLength[ cache / q - 1 ] = (int)cacheDataOut.writtenBits(); if ( ASSERTS ) assert positions.writtenBits() - writtenPositionsBitsAtLastQuantum <= Integer.MAX_VALUE : ( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ) + " > " + Integer.MAX_VALUE; cachePositionsLength[ cache / q -1 ] = (int)( positions.writtenBits() - writtenPositionsBitsAtLastQuantum ); writtenPositionsBitsAtLastQuantum = positions.writtenBits(); } cacheDataOut.align(); cacheDataOut.writtenBits( 0 ); skipPointer[ cache / q ] = pointer; out = cachePointer[ cache++ / q ]; } else { cache++; out = cacheDataOut; } currentDocument = pointer; int bitCount = 0; if ( frequency != numberOfDocuments ) { // We do not write pointers for everywhere occurring documents. switch( pointerCoding ) { case SHIFTED_GAMMA: bitCount = out.writeShiftedGamma( pointer - lastDocument - 1 ); break; case GAMMA: bitCount = out.writeGamma( pointer - lastDocument - 1 ); break; case DELTA: bitCount = out.writeDelta( pointer - lastDocument - 1 ); break; case GOLOMB: bitCount = out.writeGolomb( pointer - lastDocument - 1, b, log2b ); break; default: throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." ); } } else if ( pointer - lastDocument != 1 ) throw new IllegalStateException( "Term " + currentTerm + " has frequency equal to the number of documents, but pointers are not consecutive integers" ); state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD; bitsForPointers += bitCount; return bitCount; } public int writePayload( final OutputBitStream out, final Payload payload ) throws IOException { throw new IllegalStateException( "High-performance indices do not support payloads" ); /*if ( frequency < 0 ) throw new IllegalStateException( "Trying to write payload without calling newInvertedList" ); if ( state != BEFORE_PAYLOAD ) throw new IllegalStateException( "Trying to write payload in state " + state ); final int count = payload.write( out ); bitsForPayloads += count; state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD; return count;*/ } public int writePositionCount( final OutputBitStream out, final int count ) throws IOException { if ( frequency < 0 ) throw new IllegalStateException( "Trying to write count without calling newInvertedList" ); if ( state != BEFORE_COUNT ) throw new IllegalStateException( "Trying to write count in state " + state ); final int bitCount; numberOfOccurrences += count; switch( countCoding ) { case SHIFTED_GAMMA: bitCount = out.writeShiftedGamma( count - 1 ); break; case GAMMA: bitCount = out.writeGamma( count - 1 ); break; case UNARY: bitCount = out.writeUnary( count - 1 ); break; case DELTA: bitCount = out.writeDelta( count - 1 ); break; default: throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." ); } state = hasPositions ? BEFORE_POSITIONS : BEFORE_DOCUMENT_RECORD; bitsForCounts += bitCount; return bitCount; } public int writeDocumentPositions( @SuppressWarnings("unused") final OutputBitStream unused, final int[] occ, final int offset, final int len, final int docSize ) throws IOException { if ( frequency < 0 ) throw new IllegalStateException( "Trying to write occurrences without calling newInvertedList" ); if ( state != BEFORE_POSITIONS ) throw new IllegalStateException( "Trying to write positions in state " + state ); if ( ASSERTS && docSize > 0 ) for( int i = 0; i< len; i++ ) assert occ[ offset + i ] < docSize : "Position " + occ[ offset + i ] + " for document " + currentDocument + " is too large; size is " + docSize; int i; int prev = -1; int bitCount = 0; final int end = offset + len; final OutputBitStream positions = this.positions; switch( positionCoding ) { case GAMMA: if ( COOKIES ) bitCount += positions.writeGamma( Integer.MAX_VALUE ); for( i = offset; i < end; i++ ) { bitCount += positions.writeGamma( occ[ i ] - prev - 1 ); prev = occ[ i ]; } break; case DELTA: if ( COOKIES ) bitCount += positions.writeDelta( Integer.MAX_VALUE ); for( i = offset; i < end; i++ ) { bitCount += positions.writeDelta( occ[ i ] - prev - 1 ); prev = occ[ i ]; } break; case SHIFTED_GAMMA: if ( COOKIES ) bitCount += positions.writeShiftedGamma( Integer.MAX_VALUE ); for( i = offset; i < end; i++ ) { bitCount += positions.writeShiftedGamma( occ[ i ] - prev - 1 ); prev = occ[ i ]; } break; default: throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." ); } state = BEFORE_DOCUMENT_RECORD; bitsForPositions += bitCount; if ( len > maxCount ) maxCount = len; return bitCount; } public void close() throws IOException { if ( cache != 0 ) writeOutCache( -1 ); if ( state != BEFORE_DOCUMENT_RECORD && state != BEFORE_INVERTED_LIST ) throw new IllegalStateException( "Trying to close index in state " + state ); if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" ); if ( writtenBits() != obs.writtenBits() + positions.writtenBits() ) throw new IllegalStateException( "Written bits count mismatch: we say " + writtenBits() + ", the streams say " + ( obs.writtenBits() + positions.writtenBits() ) ); if ( offset != null ) { offset.writeLongGamma( obs.writtenBits() - lastInvertedListPos ); offset.close(); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -