📄 immutableexternalprefixdictionary.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
		final PrefixCodec codec;		final BitVector[] codeWord;		if ( packedFrequency.length != 0 ) {			codec = getPrefixCodec( packedFrequency );			prefixCoder = codec.coder();			decoder = codec.decoder();			codeWord = prefixCoder.codeWords();		}		else {			// This handles the case of a collection without words			codec = null;			prefixCoder = null;			decoder = null;			codeWord = null;		}				packedFrequency = frequency = null;		// We now compress all strings using the given codec mixed with front coding		final OutputBitStream output;		if ( selfContained ) {			final File temp = File.createTempFile( this.getClass().getName(), ".dump" );			temp.deleteOnExit();			tempDumpStreamFilename = temp.toString();			output = new OutputBitStream( temp, blockSizeInBytes );		}		else output = new OutputBitStream( tempDumpStreamFilename = dumpStreamFilename.toString(), blockSizeInBytes );				// This array will contain the delimiting words (the ones at the start of each block)		boolean isDelimiter;				int length, prevTermLength = 0, bits;		int prefixLength = 0, termCount = 0;		int currBuffer = 0;				final IntArrayList blockStarts = new IntArrayList();		final IntArrayList blockOffsets = new IntArrayList();		final ObjectArrayList<MutableString> delimiters = new ObjectArrayList<MutableString>();		prevTerm.length( 0 );				for( Iterator<?> i = terms.iterator(); i.hasNext(); ) {			s = (CharSequence) i.next();			length = s.length();			isDelimiter = false;						// We compute the common prefix and the number of bits that are necessary to code the next term.			bits = 0;			for( prefixLength = 0; prefixLength < length && prefixLength < prevTermLength && prevTerm.charAt( prefixLength ) == s.charAt( prefixLength ); prefixLength++ );			for( int j = prefixLength; j < length; j++ ) bits += codeWord[ char2symbol.get( s.charAt( j ) ) ].size();						//if ( bits + length + 1 > blockSize ) throw new IllegalArgumentException( "The string \"" + s + "\" is too long to be encoded with block size " + blockSizeInBytes );						// If the next term would overflow the block, and we are not at the start of a block, we align.			if ( output.writtenBits() % blockSize != 0 && output.writtenBits() / blockSize != ( output.writtenBits() + ( length - prefixLength + 1 ) + ( prefixLength + 1 ) + bits - 1 ) / blockSize ) {				// We align by writing 0es.				if ( DEBUG ) System.err.println( "Aligning away " + ( blockSize - output.writtenBits() % blockSize ) + " bits..." );				for( int j = (int)( blockSize - output.writtenBits() % blockSize ); j-- != 0; ) output.writeBit( 0 );				if ( ASSERTS ) assert output.writtenBits() % blockSize == 0;			}			if ( output.writtenBits() % blockSize == 0 ) {				isDelimiter = true;				prefixLength = 0;				blockOffsets.add( (int)( output.writtenBits() / blockSize ) );			}						// Note that delimiters do not get the prefix length, as it's 0.			if ( ! isDelimiter ) output.writeUnary( prefixLength );			output.writeUnary( length - prefixLength );			// Write the next coded suffix on output.			for( int j = prefixLength; j < length; j++ ) {				BitVector c = codeWord[ char2symbol.get( s.charAt( j ) ) ];				for( int k = 0; k < c.size(); k++ ) output.writeBit( c.get( k ) );			}						if ( isDelimiter ) {				if ( DEBUG ) System.err.println( "First string of block " + blockStarts.size() + ": " + termCount + " (" + s + ")" );				// The current word starts a new block				blockStarts.add( termCount );				// We do not want to rely on s being immutable.				delimiters.add( new MutableString( s ) );			}						currBuffer = 1 - currBuffer;			prevTerm.replace( s );			prevTermLength = length;			termCount++;		}				output.align();		dumpStreamLength = output.writtenBits() / 8;		output.close();				intervalApproximator = getIntervalApproximator( delimiters, prefixCoder );		blockStarts.add( size );		blockStart = blockStarts.toIntArray();		blockOffset = blockOffsets.toIntArray();				// We use a buffer of the same size of a block, hoping in fast I/O. */		dumpStream = new InputBitStream( tempDumpStreamFilename, blockSizeInBytes );	}	/** Creates an external dictionary with block size {@link #STD_BLOCK_SIZE} and specified dump stream.	 * 	 * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code>	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.	 * 	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary.	 * @param dumpStreamFilename the name of the dump stream, or <code>null</code> for a dictionary	 * with an automatic dump stream.	 */		public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms, final CharSequence dumpStreamFilename ) throws IOException {		this( terms, STD_BLOCK_SIZE, dumpStreamFilename );	}	/** Creates an external dictionary with specified block size.	 * 	 * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code>	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.	 * 	 * @param blockSizeInBytes the block size (in bytes).	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary.	 */		public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms, final int blockSizeInBytes ) throws IOException {		this( terms, blockSizeInBytes, null );	}		/** Creates an external prefix dictionary with block size {@link #STD_BLOCK_SIZE}.	 * 	 * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code>	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.	 * 	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the dictionary.	 */		public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms ) throws IOException {		this( terms, null );	}	/** An abstract factory method returning a prefix codec that will be used to compress the dump file.	 * 	 * <P>Implementing subclasses must provide a codec. The coder will be used	 * at construction time, and will be passed to {@link #getIntervalApproximator(List, PrefixCoder)},	 * whereas the decoder will be available in {@link #decoder}.	 * 	 * @param frequency the symbol frequencies.	 * @return the prefix codec that will be used to compress the dump file.	 */	protected abstract PrefixCodec getPrefixCodec( final int[] frequency );		/** An abstract factory method returning an interval approximator for this external dictionary.	 * 	 * @param delimiters a list of the words appearing at the start of each block.	 * @param prefixCoder the prefix coder provided by the prefix coded previously returned by {@link #getPrefixCodec(int[])};	 * the returned approximator can use its coder/decoder.	 * @return an interval approximator for this external dictionary.	 */	protected abstract IntervalApproximator getIntervalApproximator( List<? extends CharSequence> delimiters, PrefixCoder prefixCoder );			private void safelyCloseDumpStream() {		try {			if ( this.dumpStream != null ) this.dumpStream.close();		} 		catch ( IOException ignore ) {}	}		private void ensureNotSelfContained() {		if ( selfContained ) throw new IllegalStateException( "You cannot set the dump file of a self-contained external prefix dictionary" );	}		private boolean isEncodable( final CharSequence s ) {		for( int i = s.length(); i-- != 0; ) if ( ! char2symbol.containsKey( s.charAt( i ) ) ) return false;		return true;	}			/** Sets the dump stream of this external prefix dictionary to a given filename.	 *	 * <P>This method sets the dump file used by this dictionary, and should be only	 * called after deserialisation, providing exactly the file generated at	 * creation time. Essentially anything can happen if you do not follow the rules.	 *	 * <P>Note that this method will attempt to close the old stream, if present.	 *   	 * @param dumpStreamFilename the name of the dump file.	 * @see #setDumpStream(InputBitStream)	 */		public void setDumpStream( final CharSequence dumpStreamFilename ) throws FileNotFoundException{		ensureNotSelfContained();		safelyCloseDumpStream();		iteratorIsUsable = false;		final long newLength = new File( dumpStreamFilename.toString() ).length();		if ( newLength != dumpStreamLength )			throw new IllegalArgumentException( "The size of the new dump file (" + newLength + ") does not match the original length (" + dumpStreamLength + ")" );		dumpStream = new InputBitStream( dumpStreamFilename.toString(), (int)( blockSize / 8 ) );	}		/** Sets the dump stream of this external prefix dictionary to a given input bit stream.	 *	 * <P>This method sets the dump file used by this dictionary, and should be only	 * called after deserialisation, providing a repositionable stream containing	 * exactly the file generated at	 * creation time. Essentially anything can happen if you do not follow the rules.	 *  	 * <P>Using this method you can load an external prefix dictionary in core memory, enjoying	 * the compactness of the data structure, but getting much more speed. 	 * 	 * <P>Note that this method will attemp to close the old stream, if present.	 *   	 * @param dumpStream a repositionable input bit stream containing exactly the dump stream generated	 * at creation time.	 * @see #setDumpStream(CharSequence)	 */	public void setDumpStream( final InputBitStream dumpStream ) {		ensureNotSelfContained();		safelyCloseDumpStream();		iteratorIsUsable = false;		this.dumpStream = dumpStream;	}	private void ensureStream() {		if ( dumpStream == null ) throw new IllegalStateException( "This external prefix dictionary has been deserialised, but no dump stream has been set" );	}		public Interval getInterval( final CharSequence prefix ) {		ensureStream();		// If prefix contains any character not coded by the prefix coder, we can return the empty interval.		if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;		// We recover the left extremes of the intervals where extensions of prefix could possibly lie.		Interval interval = intervalApproximator.getApproximatedInterval( prefix );		//System.err.println( "Approximate interval: " + interval + " , terms: [" + blockStart[ interval.left ] + ", " + blockStart[ interval.right ] + "]" );		if ( interval == Intervals.EMPTY_INTERVAL ) return interval;		try {			dumpStream.position( blockOffset[ interval.left ] * blockSize );			dumpStream.readBits( 0 );			iteratorIsUsable = false;			MutableString s = new MutableString();			int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1;			/* We scan the dump file, stopping if we exhaust the block */			while( count < blockEnd ) {				if ( prefixLength < 0 ) prefixLength = 0;				else prefixLength = dumpStream.readUnary();				suffixLength = dumpStream.readUnary();				s.delete( prefixLength, s.length() );				s.length( prefixLength + suffixLength );				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );				if ( s.startsWith( prefix ) ) {					start = count;					break; 				}				count++;			}						/* If we did not find our string, there are two possibilities: if the			 * interval contains one point, there is no string extending prefix. But			 * if  the interval  is larger, the first string of the second block in the			 * interval must be an extension of prefix. */			if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;			else start = count;						end = start + 1;			//assert dumpStream.readBits() <= blockSize;			/* If the interval contains more than one point, the last string with			 * given prefix is necessarily contained in the last block, and we			 * must restart the search process. */			if ( interval.length() > 1  ) {				dumpStream.position( blockOffset[ interval.right ] * blockSize );				dumpStream.readBits( 0 );				s.length( 0 );				end = blockStart[ interval.right ];
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -