📄 immutableexternalprefixdictionary.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
				blockEnd = blockStart[ interval.right + 1 ];				prefixLength = -1;			}									while( end < blockEnd ) {				if ( prefixLength < 0 ) prefixLength = 0;				else prefixLength = dumpStream.readUnary();				suffixLength = dumpStream.readUnary();				s.delete( prefixLength, s.length() );				s.length( prefixLength + suffixLength );				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );				if ( ! s.startsWith( prefix ) ) break;				end++;			}						return Interval.valueOf( start, end - 1 );		} catch (IOException rethrow ) {			throw new RuntimeException( rethrow );		}			}			public MutableString getTerm( final int index, final MutableString s ) {		ensureStream();		ensureRestrictedIndex( index );		// We perform a binary search to find the  block to which s could possibly belong.		int block = Arrays.binarySearch( blockStart, index );		if ( block < 0 ) block = - block - 2;		try {			dumpStream.position( blockOffset[ block ] * blockSize );			dumpStream.readBits( 0 );			iteratorIsUsable = false;			int suffixLength, prefixLength = -1;			for( int i = index - blockStart[ block ] + 1; i-- != 0; ) { 				if ( prefixLength < 0 ) prefixLength = 0;				else prefixLength = dumpStream.readUnary();				suffixLength = dumpStream.readUnary();				s.delete( prefixLength, s.length() );				s.length( prefixLength + suffixLength );				for( int j = 0; j < suffixLength; j++ ) s.charAt( j + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );			}						return s;		}		catch( IOException rethrow ) {			throw new RuntimeException( rethrow );		}	}	public CharSequence getTerm( final int index ) {		return getTerm( index, new MutableString() );	}		@Deprecated	public int getIndex( final CharSequence s ) {		return getNumber( s );	}		public int getNumber( final CharSequence term ) {		ensureStream();		// If term contains any character not coded by the prefix coder, we can return -1		if ( ! isEncodable( term ) ) return -1;		/* If term is in the dictionary, any string extending term must follow term. Thus,		 * term can be in the dictionary only if it can be found in the left block		 * of an approximated interval for itself. */		Interval interval = intervalApproximator.getApproximatedInterval( term );		if ( interval == Intervals.EMPTY_INTERVAL ) return -1;		try {			dumpStream.position( blockOffset[ interval.left ] * blockSize );			dumpStream.readBits( 0 );			iteratorIsUsable = false;			MutableString s = new MutableString();			int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ];			/* We scan the dump file, stopping if we exhaust the block */			while( count < blockEnd ) {				if ( prefixLength < 0 ) prefixLength = 0;				else prefixLength = dumpStream.readUnary();				suffixLength = dumpStream.readUnary();				s.delete( prefixLength, s.length() );				s.length( prefixLength + suffixLength );				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );				if ( s.equals( term ) ) {					return count;				}				count++;			}						return -1;		}		catch (IOException rethrow ) {			throw new RuntimeException( rethrow );		}	}		public int indexOf( final Object o ) {		return getNumber( (CharSequence)o );	}	public int lastIndexOf( final Object o) {		return getNumber( (CharSequence)o );	}	public CharSequence get( final int index ) {		return getTerm( index );	}	public boolean contains( final CharSequence term ) {		return getNumber( term ) != -1;	}	public boolean hasPrefixes() {		return true;	}	public boolean hasTerms() {		return true;	}	/** An iterator over the dump stream. It does not use the interval approximator&mdash;it just scans the file. */		private final class DumpStreamIterator extends AbstractObjectIterator<CharSequence> {		/** The current block being enumerated. */		private int currBlock = -1;		/** The index of next term that will be returned. */		private int index;		/** The mutable string used to return the result. */		final MutableString s = new MutableString();		private DumpStreamIterator() {			try {				dumpStream.position( 0 );			}			catch ( IOException e ) {				throw new RuntimeException( e );			}			dumpStream.readBits( 0 );			iteratorIsUsable = true;		}				public boolean hasNext() {			if ( ! iteratorIsUsable ) throw new IllegalStateException( "Get methods of this dictionary have caused a stream repositioning" );			return index < size;		}		public CharSequence next() {			if ( ! hasNext() ) throw new NoSuchElementException();			try {				final int prefixLength;				if ( index == blockStart[ currBlock + 1 ] ) {					if ( dumpStream.readBits() % blockSize != 0 ) dumpStream.skip( blockSize - dumpStream.readBits() % blockSize );					currBlock++;					prefixLength = 0;				}				else prefixLength = dumpStream.readUnary();				final int suffixLength = dumpStream.readUnary();				s.delete( prefixLength, s.length() );				s.length( prefixLength + suffixLength );				for ( int i = 0; i < suffixLength; i++ )					s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );				index++;				return s;			}			catch ( IOException e ) {				throw new RuntimeException( e );			}		}	}		/** Returns an iterator over the dictionary.	 * 	 * <P>The iterator returned by this method scans directly the dump stream. {@link List#listIterator()},	 * instead, makes a call to {@link #get(int)} for every call to {@link java.util.ListIterator#hasNext() hasNext()}	 * and {@link java.util.ListIterator#hasPrevious() hasPrevious()}.	 * 	 * <P>Note that the returned iterator uses <em>the same stream</em> as all get methods. Calling such methods while	 * the iterator is being used will produce an {@link IllegalStateException}.	 * 	 * @return an iterator over the dictionary that just scans the dump stream.	 */		public ObjectIterator<CharSequence> iterator() {		return new DumpStreamIterator();	}		public int size() {		return size;	}		public CharSequence getPrefix( final Interval interval ) {		final MutableString s = new MutableString();		return getPrefix( interval, s );	}		public MutableString getPrefix( final Interval interval, final MutableString prefix ) {		if ( interval == Intervals.EMPTY_INTERVAL || interval.left < 0 || interval.right < 0 ) throw new IllegalArgumentException();		getTerm( interval.left, prefix );		if ( interval.length() == 1 ) return prefix;		final MutableString s = (MutableString)getTerm( interval.right );		final int l = Math.min( prefix.length(), s.length() );		int i;		for( i = 0; i < l; i++ ) if ( s.charAt( i ) != prefix.charAt( i ) ) break;		return prefix.length( i );	}		private void writeObject( final ObjectOutputStream s ) throws IOException {		s.defaultWriteObject();		if ( selfContained ) {			final FileInputStream fis = new FileInputStream( tempDumpStreamFilename );			IOUtils.copy( fis, s );			fis.close();		}	}	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {		s.defaultReadObject();		if ( selfContained ) {			final File temp = File.createTempFile( this.getClass().getName(), ".dump" );			temp.deleteOnExit();			tempDumpStreamFilename = temp.toString();			// TODO: propose Jakarta CopyUtils extension with length control and refactor.			FileOutputStream fos = new FileOutputStream( temp );			final byte[] b = new byte[ 64 * 1024 ];			int len;			while( ( len = s.read( b ) ) >= 0 ) fos.write( b, 0, len );			fos.close();			dumpStream = new InputBitStream( temp, (int)( blockSize / 8 ) );		}	}	@SuppressWarnings("unchecked")	public static void main( final String[] arg ) throws ClassNotFoundException, IOException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, JSAPException {		final SimpleJSAP jsap = new SimpleJSAP( ImmutableExternalPrefixDictionary.class.getName(), "Builds an external dictionary reading from standard input a newline-separated list of terms or a serialised term list. If the dump stream name is not specified, the dictionary will be self-contained.", 				new Parameter[] {					new FlaggedOption( "blockSize", JSAP.INTSIZE_PARSER, ( STD_BLOCK_SIZE / 1024 ) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream." ),					new Switch( "serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms." ),					new Switch( "zipped", 'z', "zipped", "Standard input is compressed in gzip format." ),					new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input." ),										new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding." ),					new FlaggedOption( "class", MG4JClassParser.getParser(), ImmutableExternalTriePrefixDictionary.class.getName(), JSAP.NOT_REQUIRED, 'c', "class", "The class used to build the dictionary." ),					new UnflaggedOption( "dictionary", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised dictionary." ),					new UnflaggedOption( "dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting dictionary will not be self-contained)." )			}		);		JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;				Collection<? extends CharSequence> termList;				final String termFile = jsapResult.getString( "termFile" );		final Charset encoding = (Charset)jsapResult.getObject( "encoding" );		final boolean zipped = jsapResult.getBoolean( "zipped" );		final boolean serialised = jsapResult.getBoolean( "serialised" );		if ( zipped && serialised ) throw new IllegalArgumentException( "The zipped and serialised options are incompatible" );		if ( serialised ) termList = (List<? extends CharSequence>) ( termFile != null ? BinIO.loadObject( termFile ) : BinIO.loadObject( System.in ) );		else {			if ( termFile != null ) termList = new FileLinesCollection( termFile, encoding.name(), zipped );			else {				final ObjectArrayList<MutableString> list = new ObjectArrayList<MutableString>();				termList = list;				final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( 						zipped ? new GZIPInputStream( System.in ) : System.in, encoding.name() ) );				final MutableString term = new MutableString();				while( terms.readLine( term ) != null ) list.add( term.copy() );				terms.close();			}		}		BinIO.storeObject( jsapResult.getClass( "class" ).getConstructor( Iterable.class, int.class, CharSequence.class ).newInstance(				termList, new Integer( jsapResult.getInt( "blockSize" ) ), jsapResult.getString( "dump" ) ), jsapResult.getString( "dictionary" ) );	}}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -