📄 immutableexternalprefixdictionary.java
字号:
blockEnd = blockStart[ interval.right + 1 ]; prefixLength = -1; } while( end < blockEnd ) { if ( prefixLength < 0 ) prefixLength = 0; else prefixLength = dumpStream.readUnary(); suffixLength = dumpStream.readUnary(); s.delete( prefixLength, s.length() ); s.length( prefixLength + suffixLength ); for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] ); if ( ! s.startsWith( prefix ) ) break; end++; } return Interval.valueOf( start, end - 1 ); } catch (IOException rethrow ) { throw new RuntimeException( rethrow ); } }

    /** Decodes the term of given index into a caller-provided mutable string.
     *
     * <P>The dump stream is positioned at the start of the block containing <code>index</code>
     * and terms are decoded sequentially (each one as a unary-coded shared-prefix length, a unary-coded
     * suffix length and the suffix symbols) until the requested one has been rebuilt.
     * As a side effect, any iterator previously returned by {@link #iterator()} becomes unusable.
     *
     * @param index the index of a term.
     * @param s a mutable string that will be overwritten with the term.
     * @return <code>s</code>.
     * @throws RuntimeException wrapping any {@link IOException} raised by the dump stream.
     */
    public MutableString getTerm( final int index, final MutableString s ) {
        ensureStream();
        ensureRestrictedIndex( index );
        // We perform a binary search to find the block to which s could possibly belong.
        int block = Arrays.binarySearch( blockStart, index );
        // A negative result encodes the insertion point: the containing block is the one just before it.
        if ( block < 0 ) block = - block - 2;

        try {
            dumpStream.position( blockOffset[ block ] * blockSize );
            dumpStream.readBits( 0 );
            iteratorIsUsable = false;
            int suffixLength, prefixLength = -1;

            for( int i = index - blockStart[ block ] + 1; i-- != 0; ) {
                // The first term of a block has no explicit prefix length (it is implicitly zero).
                if ( prefixLength < 0 ) prefixLength = 0;
                else prefixLength = dumpStream.readUnary();
                suffixLength = dumpStream.readUnary();
                s.delete( prefixLength, s.length() );
                s.length( prefixLength + suffixLength );
                for( int j = 0; j < suffixLength; j++ ) s.charAt( j + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
            }

            return s;
        }
        catch( IOException rethrow ) {
            throw new RuntimeException( rethrow );
        }
    }

    /** Returns the term of given index in a newly allocated {@link MutableString}.
     *
     * @param index the index of a term.
     * @return the term, as decoded by {@link #getTerm(int, MutableString)}.
     */
    public CharSequence getTerm( final int index ) {
        return getTerm( index, new MutableString() );
    }

    /** Delegates to {@link #getNumber(CharSequence)}.
     *
     * @param s a term.
     * @return the result of {@link #getNumber(CharSequence)} on <code>s</code>.
     * @deprecated Use {@link #getNumber(CharSequence)}.
     */
    @Deprecated
    public int getIndex( final CharSequence s ) {
        return getNumber( s );
    }

    /** Returns the index of a term, or -1 if the term is not in the dictionary.
     *
     * <P>As a side effect, when the dump stream is actually scanned any iterator previously
     * returned by {@link #iterator()} becomes unusable.
     *
     * @param term a term.
     * @return the index of <code>term</code>, or -1 if it is not encodable, if its approximated
     * interval is empty, or if it is not found in the scanned block.
     * @throws RuntimeException wrapping any {@link IOException} raised by the dump stream.
     */
    public int getNumber( final CharSequence term ) {
        ensureStream();
        // If term contains any character not coded by the prefix coder, we can return -1
        if ( ! isEncodable( term ) ) return -1;

        /* If term is in the dictionary, any string extending term must follow term. Thus,
         * term can be in the dictionary only if it can be found in the left block
         * of an approximated interval for itself. */
        Interval interval = intervalApproximator.getApproximatedInterval( term );
        if ( interval == Intervals.EMPTY_INTERVAL ) return -1;

        try {
            dumpStream.position( blockOffset[ interval.left ] * blockSize );
            dumpStream.readBits( 0 );
            iteratorIsUsable = false;
            MutableString s = new MutableString();
            int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ];

            /* We scan the dump file, stopping if we exhaust the block */
            while( count < blockEnd ) {
                // The first term of a block has no explicit prefix length (it is implicitly zero).
                if ( prefixLength < 0 ) prefixLength = 0;
                else prefixLength = dumpStream.readUnary();
                suffixLength = dumpStream.readUnary();
                s.delete( prefixLength, s.length() );
                s.length( prefixLength + suffixLength );
                for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
                if ( s.equals( term ) ) {
                    return count;
                }
                count++;
            }

            return -1;
        }
        catch (IOException rethrow ) {
            throw new RuntimeException( rethrow );
        }
    }

    /** Returns the index of the given object, which is cast to a {@link CharSequence}.
     *
     * @param o a term.
     * @return the result of {@link #getNumber(CharSequence)} on <code>o</code>.
     * @throws ClassCastException if <code>o</code> is not a {@link CharSequence}.
     */
    public int indexOf( final Object o ) {
        return getNumber( (CharSequence)o );
    }

    /** Returns the index of the given object, which is cast to a {@link CharSequence};
     * the result is the same as that of {@link #indexOf(Object)}.
     *
     * @param o a term.
     * @return the result of {@link #getNumber(CharSequence)} on <code>o</code>.
     * @throws ClassCastException if <code>o</code> is not a {@link CharSequence}.
     */
    public int lastIndexOf( final Object o) {
        return getNumber( (CharSequence)o );
    }

    /** Returns the term of given index; equivalent to {@link #getTerm(int)}.
     *
     * @param index the index of a term.
     * @return the term.
     */
    public CharSequence get( final int index ) {
        return getTerm( index );
    }

    /** Checks whether a term belongs to the dictionary.
     *
     * @param term a term.
     * @return true if {@link #getNumber(CharSequence)} does not return -1 on <code>term</code>.
     */
    public boolean contains( final CharSequence term ) {
        return getNumber( term ) != -1;
    }

    /** Returns true.
     *
     * @return true.
     */
    public boolean hasPrefixes() {
        return true;
    }

    /** Returns true.
     *
     * @return true.
     */
    public boolean hasTerms() {
        return true;
    }

    /** An iterator over the dump stream. It does not use the interval approximator&mdash;it just scans the file. */
    private final class DumpStreamIterator extends AbstractObjectIterator<CharSequence> {
        /** The current block being enumerated. */
        private int currBlock = -1;
        /** The index of next term that will be returned. */
        private int index;
        /** The mutable string used to return the result. */
        final MutableString s = new MutableString();

        /** Repositions the dump stream at its start and marks the iterator as usable. */
        private DumpStreamIterator() {
            try {
                dumpStream.position( 0 );
            }
            catch ( IOException e ) {
                throw new RuntimeException( e );
            }
            dumpStream.readBits( 0 );
            iteratorIsUsable = true;
        }

        /** Checks whether there are more terms.
         *
         * @return true if not all terms have been returned.
         * @throws IllegalStateException if the dump stream has been repositioned by a get method.
         */
        public boolean hasNext() {
            if ( ! iteratorIsUsable ) throw new IllegalStateException( "Get methods of this dictionary have caused a stream repositioning" );
            return index < size;
        }

        /** Decodes and returns the next term.
         *
         * <P>The same {@link MutableString} instance is returned (and overwritten) at each call.
         *
         * @return the next term.
         * @throws NoSuchElementException if there are no more terms.
         */
        public CharSequence next() {
            if ( ! hasNext() ) throw new NoSuchElementException();
            try {
                final int prefixLength;
                if ( index == blockStart[ currBlock + 1 ] ) {
                    // A new block begins: skip to the next multiple of the block size, if necessary.
                    if ( dumpStream.readBits() % blockSize != 0 ) dumpStream.skip( blockSize - dumpStream.readBits() % blockSize );
                    currBlock++;
                    prefixLength = 0;
                }
                else prefixLength = dumpStream.readUnary();
                final int suffixLength = dumpStream.readUnary();
                s.delete( prefixLength, s.length() );
                s.length( prefixLength + suffixLength );
                for ( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
                index++;
                return s;
            }
            catch ( IOException e ) {
                throw new RuntimeException( e );
            }
        }
    }

    /** Returns an iterator over the dictionary.
     *
     * <P>The iterator returned by this method scans directly the dump stream. {@link List#listIterator()},
     * instead, makes a call to {@link #get(int)} for every call to {@link java.util.ListIterator#hasNext() hasNext()}
     * and {@link java.util.ListIterator#hasPrevious() hasPrevious()}.
     *
     * <P>Note that the returned iterator uses <em>the same stream</em> as all get methods. Calling such methods while
     * the iterator is being used will produce an {@link IllegalStateException}.
     *
     * @return an iterator over the dictionary that just scans the dump stream.
     */
    public ObjectIterator<CharSequence> iterator() {
        return new DumpStreamIterator();
    }

    /** Returns the value of the <code>size</code> field.
     *
     * @return the size of this dictionary.
     */
    public int size() {
        return size;
    }

    /** Returns, in a newly allocated {@link MutableString}, the longest common prefix of the
     * terms at the extremes of the given interval.
     *
     * @param interval a nonempty interval with nonnegative extremes.
     * @return the result of {@link #getPrefix(Interval, MutableString)} on a new mutable string.
     * @throws IllegalArgumentException if <code>interval</code> is empty or has a negative extreme.
     */
    public CharSequence getPrefix( final Interval interval ) {
        final MutableString s = new MutableString();
        return getPrefix( interval, s );
    }

    /** Computes the longest common prefix of the terms at the extremes of the given interval.
     *
     * <P>The term of index <code>interval.left</code> is decoded into <code>prefix</code>; if the interval
     * contains more than one element, <code>prefix</code> is then truncated to the longest prefix it
     * shares with the term of index <code>interval.right</code>.
     *
     * @param interval a nonempty interval with nonnegative extremes.
     * @param prefix a mutable string that will be overwritten with the result.
     * @return <code>prefix</code>.
     * @throws IllegalArgumentException if <code>interval</code> is empty or has a negative extreme.
     */
    public MutableString getPrefix( final Interval interval, final MutableString prefix ) {
        if ( interval == Intervals.EMPTY_INTERVAL || interval.left < 0 || interval.right < 0 ) throw new IllegalArgumentException();
        getTerm( interval.left, prefix );
        if ( interval.length() == 1 ) return prefix;
        final MutableString s = (MutableString)getTerm( interval.right );
        final int l = Math.min( prefix.length(), s.length() );
        int i;
        for( i = 0; i < l; i++ ) if ( s.charAt( i ) != prefix.charAt( i ) ) break;
        return prefix.length( i );
    }

    /** Serialises this dictionary; if it is self-contained, the content of the file named
     * <code>tempDumpStreamFilename</code> is appended to the stream after the default fields.
     *
     * @param s the object output stream.
     * @throws IOException if the dump file cannot be read or the stream cannot be written.
     */
    private void writeObject( final ObjectOutputStream s ) throws IOException {
        s.defaultWriteObject();
        if ( selfContained ) {
            final FileInputStream fis = new FileInputStream( tempDumpStreamFilename );
            // The file is closed even if the copy fails.
            try {
                IOUtils.copy( fis, s );
            }
            finally {
                fis.close();
            }
        }
    }

    /** Deserialises this dictionary; if it is self-contained, the remaining content of the stream
     * is copied to a temporary file (deleted on exit) on which the dump stream is reopened.
     *
     * @param s the object input stream.
     * @throws IOException if the stream cannot be read or the temporary file cannot be written.
     * @throws ClassNotFoundException if default deserialisation cannot resolve a class.
     */
    private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
        s.defaultReadObject();
        if ( selfContained ) {
            final File temp = File.createTempFile( this.getClass().getName(), ".dump" );
            temp.deleteOnExit();
            tempDumpStreamFilename = temp.toString();
            // TODO: propose Jakarta CopyUtils extension with length control and refactor.
            final FileOutputStream fos = new FileOutputStream( temp );
            // The file is closed even if the copy fails.
            try {
                final byte[] b = new byte[ 64 * 1024 ];
                int len;
                while( ( len = s.read( b ) ) >= 0 ) fos.write( b, 0, len );
            }
            finally {
                fos.close();
            }
            dumpStream = new InputBitStream( temp, (int)( blockSize / 8 ) );
        }
    }

    /** Command-line entry point: reads a list of terms (from standard input or from a file,
     * either as newline-separated text or as a serialised list), builds a dictionary of the
     * class specified by the <code>class</code> option and stores it in serialised form.
     *
     * @param arg the command-line arguments, parsed using JSAP.
     */
    @SuppressWarnings("unchecked")
    public static void main( final String[] arg ) throws ClassNotFoundException, IOException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, JSAPException {

        final SimpleJSAP jsap = new SimpleJSAP( ImmutableExternalPrefixDictionary.class.getName(),
            "Builds an external dictionary reading from standard input a newline-separated list of terms or a serialised term list. If the dump stream name is not specified, the dictionary will be self-contained.",
            new Parameter[] {
                new FlaggedOption( "blockSize", JSAP.INTSIZE_PARSER, ( STD_BLOCK_SIZE / 1024 ) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream." ),
                new Switch( "serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms." ),
                new Switch( "zipped", 'z', "zipped", "Standard input is compressed in gzip format." ),
                new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input." ),
                new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding." ),
                new FlaggedOption( "class", MG4JClassParser.getParser(), ImmutableExternalTriePrefixDictionary.class.getName(), JSAP.NOT_REQUIRED, 'c', "class", "The class used to build the dictionary." ),
                new UnflaggedOption( "dictionary", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised dictionary." ),
                new UnflaggedOption( "dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting dictionary will not be self-contained)." )
            }
        );

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        Collection<? extends CharSequence> termList;

        final String termFile = jsapResult.getString( "termFile" );
        final Charset encoding = (Charset)jsapResult.getObject( "encoding" );
        final boolean zipped = jsapResult.getBoolean( "zipped" );
        final boolean serialised = jsapResult.getBoolean( "serialised" );

        if ( zipped && serialised ) throw new IllegalArgumentException( "The zipped and serialised options are incompatible" );

        if ( serialised ) termList = (List<? extends CharSequence>) ( termFile != null ? BinIO.loadObject( termFile ) : BinIO.loadObject( System.in ) );
        else {
            if ( termFile != null ) termList = new FileLinesCollection( termFile, encoding.name(), zipped );
            else {
                final ObjectArrayList<MutableString> list = new ObjectArrayList<MutableString>();
                termList = list;
                final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( zipped ? new GZIPInputStream( System.in ) : System.in, encoding.name() ) );
                // The reader is closed even if reading fails.
                try {
                    final MutableString term = new MutableString();
                    while( terms.readLine( term ) != null ) list.add( term.copy() );
                }
                finally {
                    terms.close();
                }
            }
        }

        // The dictionary is built reflectively through the (Iterable, int, CharSequence) constructor of the chosen class.
        BinIO.storeObject( jsapResult.getClass( "class" ).getConstructor( Iterable.class, int.class, CharSequence.class ).newInstance( termList, Integer.valueOf( jsapResult.getInt( "blockSize" ) ), jsapResult.getString( "dump" ) ), jsapResult.getString( "dictionary" ) );
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -