📄 PartitionLexically.java
			termNumber++;
		}
		terms.close();
		for( int i = 0; i < numIndices; i++ ) localTerms[ i ].close();
		pl.done();
	}

	public void run() throws ConfigurationException, IOException, ClassNotFoundException {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		final byte[] buffer = new byte[ bufferSize ];
		// One output stream per local index for each index component.
		final OutputBitStream[] localIndexStream = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localPositionsStream = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localOffsets = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localFrequencies = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localGlobCounts = new OutputBitStream[ numIndices ];
		final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
		final int[] numTerms = new int[ numIndices ];
		final long[] numberOfOccurrences = new long[ numIndices ];
		final long[] numberOfPostings = new long[ numIndices ];
		// High-performance indices keep positions in a separate file.
		final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );

		final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
		final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
		final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
		final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
		final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
		final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
		offsets.readGamma(); // Discard the first offset (always zero).

		for( int i = 0; i < numIndices; i++ ) {
			localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
			if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
			localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
			localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
			localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
			localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
			localOffsets[ i ].writeGamma( 0 );
		}

		// The current term.
		final MutableString currTerm = new MutableString();

		pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
		pl.itemsName = "bits";
		pl.logInterval = logInterval;
		pl.start( "Partitioning index..." );
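		/*
		 * Main loop: for each term in the global term list, the partitioning
		 * strategy maps the global term number to a local index k. Frequencies,
		 * global counts and the term itself go to the k-th local index, and the
		 * term's posting-list bits are poured verbatim from the global bitstream.
		 * For high-performance indices a term's positions are poured one term
		 * late, since only the next header reveals where they end.
		 */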
		int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
		long length, count, positionsOffset = 0;
		int res, frequency;

		while( terms.readLine( currTerm ) != null ) {
			k = strategy.localIndex( termNumber ); // The local index of this term.
			if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
			numTerms[ k ]++;

			if ( isHighPerformance ) {
				// The header of this term stores the offset of its positions,
				// which also marks where the previous term's positions end.
				final long temp = globalIndex.readBits();
				positionsOffset = globalIndex.readLongDelta();
				previousHeaderLength = (int)( globalIndex.readBits() - temp );
				if ( prevK != -1 ) {
					length = positionsOffset - globalPositions.readBits();
					pl.count += length;
					while( length > 0 ) {
						res = (int)Math.min( bufferSize * 8, length );
						globalPositions.read( buffer, res );
						localPositionsStream[ prevK ].write( buffer, res );
						length -= res;
					}
				}
				newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
			}

			frequency = frequencies.readGamma();
			localFrequencies[ k ].writeGamma( frequency );
			numberOfPostings[ k ] += frequency;

			count = globCounts.readLongGamma();
			numberOfOccurrences[ k ] += count;
			localGlobCounts[ k ].writeLongGamma( count );

			currTerm.println( localTerms[ k ] );

			// Copy the posting list verbatim, rewriting the header for local offsets.
			length = offsets.readLongGamma() - previousHeaderLength;
			localOffsets[ k ].writeLongGamma( length + newHeaderLength );
			pl.count += length + previousHeaderLength - 1;
			while( length > 0 ) {
				res = (int)Math.min( bufferSize * 8, length );
				globalIndex.read( buffer, res );
				localIndexStream[ k ].write( buffer, res );
				length -= res;
			}

			pl.update();
			prevK = k;
			termNumber++;
		}

		// We pour the last piece of positions: what remains of the positions
		// file belongs to the last term seen.
		if ( isHighPerformance ) {
			if ( prevK != -1 ) {
				length = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() * 8 - globalPositions.readBits();
				while( length > 0 ) {
					res = (int)Math.min( bufferSize * 8, length );
					globalPositions.read( buffer, res );
					localPositionsStream[ prevK ].write( buffer, res );
					length -= res;
				}
			}
		}

		pl.done();

		terms.close();
		offsets.close();
		frequencies.close();
		globCounts.close();
		globalIndex.close();
		if ( isHighPerformance ) globalPositions.close();

		// We copy the relevant properties from the original index.
		Properties properties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		Properties globalProperties = new Properties();
		if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
		globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, false );
		globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, LexicalCluster.class.getName() );
		for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
		globalProperties.setProperty( Index.PropertyKeys.FIELD, properties.getProperty( Index.PropertyKeys.FIELD ) );
		globalProperties.setProperty( Index.PropertyKeys.POSTINGS, properties.getProperty( Index.PropertyKeys.POSTINGS ) );
		globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, properties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
		globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, properties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMS, properties.getProperty( Index.PropertyKeys.TERMS ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, properties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
		globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, properties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
		globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, properties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
		globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );

		for( int i = 0; i < numIndices; i++ ) {
			localIndexStream[ i ].close();
			if ( isHighPerformance ) localPositionsStream[ i ].close();
			localOffsets[ i ].close();
			localFrequencies[ i ].close();
			localGlobCounts[ i ].close();
			localTerms[ i ].close();

			// Each local index gets a copy of the global sizes file.
			final InputStream input = new FileInputStream( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
			final OutputStream output = new FileOutputStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
			IOUtils.copy( input, output );
			input.close();
			output.close();

			Properties localProperties = new Properties();
			localProperties.addAll( globalProperties );
			localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
			localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, numberOfOccurrences[ i ] );
			localProperties.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings[ i ] );
			localProperties.setProperty( Index.PropertyKeys.INDEXCLASS, properties.getProperty( Index.PropertyKeys.INDEXCLASS ) );
			localProperties.addProperties( Index.PropertyKeys.CODING, properties.getStringArray( Index.PropertyKeys.CODING ) );
			localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM ) );
			localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT ) );
			if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
			localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
			LOGGER.debug( "Post-partitioning properties for index " + localBasename[ i ] + ": " + new ConfigurationMap( localProperties ) );
		}
	}

	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {
		SimpleJSAP jsap = new SimpleJSAP( PartitionLexically.class.getName(), "Partitions an index lexically.",
			new Parameter[] {
				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised lexical partitioning strategy." ),
				new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
				new Switch( "termsOnly", 't', "terms-only", "Just partition the term list." ),
				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
			});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		String inputBasename = jsapResult.getString( "inputBasename" );
		String outputBasename = jsapResult.getString( "outputBasename" );
		String strategyFilename = jsapResult.getString( "strategy" );
		LexicalPartitioningStrategy strategy = null;

		if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
			strategy = LexicalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), DiskBasedIndex.getInstance( inputBasename, false, false, true ) );
			BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
		}
		else if ( strategyFilename != null ) strategy = (LexicalPartitioningStrategy)BinIO.loadObject( strategyFilename );
		else throw new IllegalArgumentException( "You must specify a splitting strategy" );

		final PartitionLexically partitionLexically = new PartitionLexically( inputBasename, outputBasename, strategy, strategyFilename, jsapResult.getInt( "bufferSize" ), jsapResult.getLong( "logInterval" ) );

		if ( jsapResult.getBoolean( "termsOnly" ) ) partitionLexically.runTermsOnly();
		else partitionLexically.run();
	}
}
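For reference, a minimal sketch of driving the partitioner programmatically rather than from the command line; it simply mirrors what main() above does when a uniform strategy is requested. The basenames "global" and "local" and the part count 4 are placeholders, and the snippet assumes it runs inside this class so that the constructor and DEFAULT_BUFFER_SIZE are visible:

	// Build a uniform strategy over the global index and serialise it,
	// exactly as main() does when -u/--uniform is specified.
	final LexicalPartitioningStrategy strategy = LexicalStrategies.uniform( 4, DiskBasedIndex.getInstance( "global", false, false, true ) );
	final String strategyFilename = "local" + IndexCluster.STRATEGY_DEFAULT_EXTENSION;
	BinIO.storeObject( strategy, strategyFilename );
	// Partition the index into four lexical parts.
	new PartitionLexically( "global", "local", strategy, strategyFilename, DEFAULT_BUFFER_SIZE, ProgressLogger.DEFAULT_LOG_INTERVAL ).run();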