📄 immutableexternalprefixdictionary.java
字号:
final PrefixCodec codec; final BitVector[] codeWord; if ( packedFrequency.length != 0 ) { codec = getPrefixCodec( packedFrequency ); prefixCoder = codec.coder(); decoder = codec.decoder(); codeWord = prefixCoder.codeWords(); } else { // This handles the case of a collection without words codec = null; prefixCoder = null; decoder = null; codeWord = null; } packedFrequency = frequency = null; // We now compress all strings using the given codec mixed with front coding final OutputBitStream output; if ( selfContained ) { final File temp = File.createTempFile( this.getClass().getName(), ".dump" ); temp.deleteOnExit(); tempDumpStreamFilename = temp.toString(); output = new OutputBitStream( temp, blockSizeInBytes ); } else output = new OutputBitStream( tempDumpStreamFilename = dumpStreamFilename.toString(), blockSizeInBytes ); // This array will contain the delimiting words (the ones at the start of each block) boolean isDelimiter; int length, prevTermLength = 0, bits; int prefixLength = 0, termCount = 0; int currBuffer = 0; final IntArrayList blockStarts = new IntArrayList(); final IntArrayList blockOffsets = new IntArrayList(); final ObjectArrayList<MutableString> delimiters = new ObjectArrayList<MutableString>(); prevTerm.length( 0 ); for( Iterator<?> i = terms.iterator(); i.hasNext(); ) { s = (CharSequence) i.next(); length = s.length(); isDelimiter = false; // We compute the common prefix and the number of bits that are necessary to code the next term. bits = 0; for( prefixLength = 0; prefixLength < length && prefixLength < prevTermLength && prevTerm.charAt( prefixLength ) == s.charAt( prefixLength ); prefixLength++ ); for( int j = prefixLength; j < length; j++ ) bits += codeWord[ char2symbol.get( s.charAt( j ) ) ].size(); //if ( bits + length + 1 > blockSize ) throw new IllegalArgumentException( "The string \"" + s + "\" is too long to be encoded with block size " + blockSizeInBytes ); // If the next term would overflow the block, and we are not at the start of a block, we align. if ( output.writtenBits() % blockSize != 0 && output.writtenBits() / blockSize != ( output.writtenBits() + ( length - prefixLength + 1 ) + ( prefixLength + 1 ) + bits - 1 ) / blockSize ) { // We align by writing 0es. if ( DEBUG ) System.err.println( "Aligning away " + ( blockSize - output.writtenBits() % blockSize ) + " bits..." ); for( int j = (int)( blockSize - output.writtenBits() % blockSize ); j-- != 0; ) output.writeBit( 0 ); if ( ASSERTS ) assert output.writtenBits() % blockSize == 0; } if ( output.writtenBits() % blockSize == 0 ) { isDelimiter = true; prefixLength = 0; blockOffsets.add( (int)( output.writtenBits() / blockSize ) ); } // Note that delimiters do not get the prefix length, as it's 0. if ( ! isDelimiter ) output.writeUnary( prefixLength ); output.writeUnary( length - prefixLength ); // Write the next coded suffix on output. for( int j = prefixLength; j < length; j++ ) { BitVector c = codeWord[ char2symbol.get( s.charAt( j ) ) ]; for( int k = 0; k < c.size(); k++ ) output.writeBit( c.get( k ) ); } if ( isDelimiter ) { if ( DEBUG ) System.err.println( "First string of block " + blockStarts.size() + ": " + termCount + " (" + s + ")" ); // The current word starts a new block blockStarts.add( termCount ); // We do not want to rely on s being immutable. delimiters.add( new MutableString( s ) ); } currBuffer = 1 - currBuffer; prevTerm.replace( s ); prevTermLength = length; termCount++; } output.align(); dumpStreamLength = output.writtenBits() / 8; output.close(); intervalApproximator = getIntervalApproximator( delimiters, prefixCoder ); blockStarts.add( size ); blockStart = blockStarts.toIntArray(); blockOffset = blockOffsets.toIntArray(); // We use a buffer of the same size of a block, hoping in fast I/O. */ dumpStream = new InputBitStream( tempDumpStreamFilename, blockSizeInBytes ); } /** Creates an external dictionary with block size {@link #STD_BLOCK_SIZE} and specified dump stream. * * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code> * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}. * * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary. * @param dumpStreamFilename the name of the dump stream, or <code>null</code> for a dictionary * with an automatic dump stream. */ public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms, final CharSequence dumpStreamFilename ) throws IOException { this( terms, STD_BLOCK_SIZE, dumpStreamFilename ); } /** Creates an external dictionary with specified block size. * * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code> * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}. * * @param blockSizeInBytes the block size (in bytes). * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary. */ public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms, final int blockSizeInBytes ) throws IOException { this( terms, blockSizeInBytes, null ); } /** Creates an external prefix dictionary with block size {@link #STD_BLOCK_SIZE}. * * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code> * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}. * * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary. */ public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms ) throws IOException { this( terms, null ); } /** An abstract factory method returning a prefix codec that will be used to compress the dump file. * * <P>Implementing subclasses must provide a codec. The coder will be used * at construction time, and will be passed to {@link #getIntervalApproximator(List, PrefixCoder)}, * whereas the decoder will be available in {@link #decoder}. * * @param frequency the symbol frequencies. * @return the prefix codec that will be used to compress the dump file. */ protected abstract PrefixCodec getPrefixCodec( final int[] frequency ); /** An abstract factory method returning an interval approximator for this external dictionary. * * @param delimiters a list of the words appearing at the start of each block. * @param prefixCoder the prefix coder provided by the prefix coded previously returned by {@link #getPrefixCodec(int[])}; * the returned approximator can use its coder/decoder. * @return an interval approximator for this external dictionary. */ protected abstract IntervalApproximator getIntervalApproximator( List<? extends CharSequence> delimiters, PrefixCoder prefixCoder ); private void safelyCloseDumpStream() { try { if ( this.dumpStream != null ) this.dumpStream.close(); } catch ( IOException ignore ) {} } private void ensureNotSelfContained() { if ( selfContained ) throw new IllegalStateException( "You cannot set the dump file of a self-contained external prefix dictionary" ); } private boolean isEncodable( final CharSequence s ) { for( int i = s.length(); i-- != 0; ) if ( ! char2symbol.containsKey( s.charAt( i ) ) ) return false; return true; } /** Sets the dump stream of this external prefix dictionary to a given filename. * * <P>This method sets the dump file used by this dictionary, and should be only * called after deserialisation, providing exactly the file generated at * creation time. Essentially anything can happen if you do not follow the rules. * * <P>Note that this method will attempt to close the old stream, if present. * * @param dumpStreamFilename the name of the dump file. * @see #setDumpStream(InputBitStream) */ public void setDumpStream( final CharSequence dumpStreamFilename ) throws FileNotFoundException{ ensureNotSelfContained(); safelyCloseDumpStream(); iteratorIsUsable = false; final long newLength = new File( dumpStreamFilename.toString() ).length(); if ( newLength != dumpStreamLength ) throw new IllegalArgumentException( "The size of the new dump file (" + newLength + ") does not match the original length (" + dumpStreamLength + ")" ); dumpStream = new InputBitStream( dumpStreamFilename.toString(), (int)( blockSize / 8 ) ); } /** Sets the dump stream of this external prefix dictionary to a given input bit stream. * * <P>This method sets the dump file used by this dictionary, and should be only * called after deserialisation, providing a repositionable stream containing * exactly the file generated at * creation time. Essentially anything can happen if you do not follow the rules. * * <P>Using this method you can load an external prefix dictionary in core memory, enjoying * the compactness of the data structure, but getting much more speed. * * <P>Note that this method will attemp to close the old stream, if present. * * @param dumpStream a repositionable input bit stream containing exactly the dump stream generated * at creation time. * @see #setDumpStream(CharSequence) */ public void setDumpStream( final InputBitStream dumpStream ) { ensureNotSelfContained(); safelyCloseDumpStream(); iteratorIsUsable = false; this.dumpStream = dumpStream; } private void ensureStream() { if ( dumpStream == null ) throw new IllegalStateException( "This external prefix dictionary has been deserialised, but no dump stream has been set" ); } public Interval getInterval( final CharSequence prefix ) { ensureStream(); // If prefix contains any character not coded by the prefix coder, we can return the empty interval. if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL; // We recover the left extremes of the intervals where extensions of prefix could possibly lie. Interval interval = intervalApproximator.getApproximatedInterval( prefix ); //System.err.println( "Approximate interval: " + interval + " , terms: [" + blockStart[ interval.left ] + ", " + blockStart[ interval.right ] + "]" ); if ( interval == Intervals.EMPTY_INTERVAL ) return interval; try { dumpStream.position( blockOffset[ interval.left ] * blockSize ); dumpStream.readBits( 0 ); iteratorIsUsable = false; MutableString s = new MutableString(); int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1; /* We scan the dump file, stopping if we exhaust the block */ while( count < blockEnd ) { if ( prefixLength < 0 ) prefixLength = 0; else prefixLength = dumpStream.readUnary(); suffixLength = dumpStream.readUnary(); s.delete( prefixLength, s.length() ); s.length( prefixLength + suffixLength ); for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] ); if ( s.startsWith( prefix ) ) { start = count; break; } count++; } /* If we did not find our string, there are two possibilities: if the * interval contains one point, there is no string extending prefix. But * if the interval is larger, the first string of the second block in the * interval must be an extension of prefix. */ if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL; else start = count; end = start + 1; //assert dumpStream.readBits() <= blockSize; /* If the interval contains more than one point, the last string with * given prefix is necessarily contained in the last block, and we * must restart the search process. */ if ( interval.length() > 1 ) { dumpStream.position( blockOffset[ interval.right ] * blockSize ); dumpStream.readBits( 0 ); s.length( 0 ); end = blockStart[ interval.right ];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -