📄 partitiondocumentally.java
字号:
    // NOTE(review): this fragment begins inside the enclosing class (presumably
    // PartitionDocumentally — TODO confirm against the full file). The code below is the
    // tail of the size-partitioning method whose signature lies above this view, followed
    // by run() and main(). Only comments/formatting were changed; all code tokens are intact.

    // ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
    int size, localIndex;
    if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
        // Local index 0 covers every global document: for each document, write its size
        // to the owning local size list and an explicit 0 to every other list, so all
        // local size lists stay index-aligned with the global document numbering.
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
            localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
            if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
            for( int l = numIndices; l-- != 0; ) if ( l != localIndex ) localSizes[ l ].writeGamma( 0 );
        }
    }
    else {
        // No gap filling: each size goes only to the owning local index's size list.
        for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
            localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
            if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
        }
    }
    sizes.close();
    for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
    // NOTE(review): two closing braces follow; the scope opened for the first one is
    // above this fragment (likely the method body plus an enclosing block — TODO confirm).
    }
    }

    /**
     * Partitions the global index into {@code numIndices} local indices.
     *
     * <p>For each term: the posting list is read from {@code indexReader} and each posting
     * is appended (gamma-coded) to a per-local-index temporary bit stream; afterwards, the
     * temporary data of every local index touched by the term is replayed into that index's
     * {@code indexWriter}, remapping global document pointers to local pointers via
     * {@code strategy.localPointer()}. Finally, per-local-index property files and the
     * global cluster property file are written.
     *
     * @throws Exception if any of the underlying I/O or index operations fail.
     */
    public void run() throws Exception {
        final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
        final IntList sizeList = globalIndex.sizes;
        partitionSizes();

        // Scratch storage, one slot per local index.
        final int[] position = new int[ globalIndex.maxCount ];   // positions of the current posting
        final int[] localFrequency = new int[ numIndices ];       // per-index frequency of the current term
        final int[] usedIndex = new int[ numIndices ];            // local indices touched by the current term
        // Two readers over the same temporary data: 'direct' wraps the in-memory buffer of
        // the caching stream, 'indirect' reads the backing file (used when the buffer overflowed).
        final InputBitStream[] direct = new InputBitStream[ numIndices ];
        final InputBitStream[] indirect = new InputBitStream[ numIndices ];
        final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
        final File[] tempFile = new File[ numIndices ];
        final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
        IndexIterator indexIterator;

        for ( int i = 0; i < numIndices; i++ ) {
            tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
            temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
            direct[ i ] = new InputBitStream( temp[ i ].buffer() );
            indirect[ i ] = new InputBitStream( tempFile[ i ] );
            if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
        }

        int usedIndices;
        MutableString currentTerm = new MutableString();
        Payload payload = null;
        // NOTE(review): 'count' is written in the buffering phase and reused in the replay
        // phase below; it is only meaningful when haveCounts is true.
        int frequency, globalPointer, localIndex, localPointer, count = -1;

        pl.expectedUpdates = globalIndex.numberOfPostings;
        pl.itemsName = "postings";
        pl.logInterval = logInterval;
        pl.start( "Partitioning index..." );

        for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
            terms.readLine( currentTerm );
            indexIterator = indexReader.nextIterator();
            usedIndices = 0;
            frequency = indexIterator.frequency();

            // Phase 1: scatter the term's postings into per-local-index temporary streams.
            for ( int j = 0; j < frequency; j++ ) {
                globalPointer = indexIterator.nextDocument();
                localIndex = strategy.localIndex( globalPointer );
                if ( localFrequency[ localIndex ] == 0 ) {
                    // First time we see a document for this index.
                    currentTerm.println( localTerms[ localIndex ] );
                    numTerms[ localIndex ]++;
                    usedIndex[ usedIndices++ ] = localIndex;
                    if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
                }
                /* Store temporarily posting data; note that we save the global pointer as we
                 * will have to access the size list. */
                localFrequency[ localIndex ]++;
                numPostings[ localIndex ]++;
                temp[ localIndex ].writeGamma( globalPointer );
                // NOTE(review): payload is refreshed only when the *global* index has
                // payloads, but written whenever havePayloads is set — if those two flags
                // can disagree this is a latent NPE; confirm they are always consistent.
                if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
                if ( havePayloads ) payload.write( temp[ localIndex ] );
                if ( haveCounts ) {
                    count = indexIterator.count();
                    temp[ localIndex ].writeGamma( count );
                    globCount[ localIndex ] += count;
                    if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;
                    if ( havePositions ) {
                        final int[] pos = indexIterator.positionArray();
                        // TODO: compress this stuff
                        for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
                    }
                }
            }

            // We now run through the indices used by this term and copy from the temporary buffer.
            OutputBitStream obs;
            for( int k = 0; k < usedIndices; k++ ) {
                final int i = usedIndex[ k ];
                localFrequencies[ i ].writeGamma( localFrequency[ i ] );
                if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
                if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
                globCount[ i ] = 0;

                InputBitStream ibs;
                indexWriter[ i ].newInvertedList();
                temp[ i ].align();
                if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
                else {
                    // We cannot read directly from the internal buffer.
                    // Flush written bits to the file and discard the reader's stale state.
                    ibs = indirect[ i ];
                    ibs.flush();
                    temp[ i ].flush();
                }
                ibs.position( 0 );
                indexWriter[ i ].writeFrequency( localFrequency[ i ] );

                // Phase 2: replay the buffered postings into the local index writer,
                // translating global document pointers to local ones.
                for( int j = 0; j < localFrequency[ i ]; j++ ) {
                    obs = indexWriter[ i ].newDocumentRecord();
                    globalPointer = ibs.readGamma();
                    localPointer = strategy.localPointer( globalPointer );
                    indexWriter[ i ].writeDocumentPointer( obs, localPointer );
                    if ( havePayloads ) {
                        payload.read( ibs );
                        indexWriter[ i ].writePayload( obs, payload );
                    }
                    if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
                    if ( havePositions ) {
                        for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
                        // The size list is indexed by the *global* pointer — this is why the
                        // global pointer (not the local one) was buffered above.
                        indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
                    }
                }
                // Rewind the temporary stream so the next term reuses it from scratch.
                temp[ i ].position( 0 );
                temp[ i ].writtenBits( 0 );
                localFrequency[ i ] = 0;
            }
            usedIndices = 0;
            // NOTE(review): presumably pl.update() accounts for one posting itself, hence
            // the frequency - 1 increment — confirm against ProgressLogger's contract.
            pl.count += frequency - 1;
            pl.update();
        }
        pl.done();

        // Build the shared (global) properties and write one property file per local index.
        Properties globalProperties = new Properties();
        globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
        globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );

        for ( int i = 0; i < numIndices; i++ ) {
            localFrequencies[ i ].close();
            if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
            localTerms[ i ].close();
            indexWriter[ i ].close();
            if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
            temp[ i ].close();
            tempFile[ i ].delete();

            Properties localProperties = indexWriter[ i ].properties();
            localProperties.addAll( globalProperties );
            localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
            localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
            localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
            localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
            localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
            localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
            if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
            if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
            localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
        }

        // Write the global cluster property file describing all local indices.
        if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
        for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
        globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
        // If we partition an index with a single term, by definition we have a flat cluster
        globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
        globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
        globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
        globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
        globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
        globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
        globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
        if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
        /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
         * strategy we can optimise a bit. */
        globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, strategy instanceof ContiguousDocumentalStrategy ? DocumentalConcatenatedCluster.class.getName() : DocumentalMergedCluster.class.getName() );
        globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
        LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );
    }

    /**
     * Command-line entry point: parses options with JSAP, obtains a documental
     * partitioning strategy (either a uniform one built on the fly, or one deserialised
     * from a file), and runs the partitioner.
     *
     * @param arg the command-line arguments (see the JSAP option descriptions below).
     * @throws IllegalArgumentException if no partitioning strategy is specified.
     */
    public static void main( final String arg[] ) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, Exception {
        SimpleJSAP jsap = new SimpleJSAP( PartitionDocumentally.class.getName(), "Partitions an index documentally.",
            new Parameter[] {
                new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
                new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
                new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised documental partitioning strategy." ),
                new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
                new FlaggedOption( "bloom", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'B', "bloom", "Generates Bloom filters with given precision." ),
                new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
                new Switch( "skips", JSAP.NO_SHORTFLAG, "skips", "Requires skips (which however are present by default, unless you required an interleaved index)." ),
                new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
                new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, "64", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum." ),
                new FlaggedOption( "height", JSAP.INTSIZE_PARSER, "8", JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
                new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
                new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
                new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
            });

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        String inputBasename = jsapResult.getString( "inputBasename" );
        String outputBasename = jsapResult.getString( "outputBasename" );
        String strategyFilename = jsapResult.getString( "strategy" );
        DocumentalPartitioningStrategy strategy = null;

        // Obtain the strategy: a uniform split (which is also serialised next to the
        // output for later use), a deserialised one, or fail.
        if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
            strategy = DocumentalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), Index.getInstance( inputBasename ).numberOfDocuments );
            BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
        }
        else if ( strategyFilename != null ) strategy = (DocumentalPartitioningStrategy)BinIO.loadObject( strategyFilename );
        else throw new IllegalArgumentException( "You must specify a partitioning strategy" );

        final boolean skips = jsapResult.getBoolean( "skips" );
        final boolean interleaved = jsapResult.getBoolean( "interleaved" );
        // NOTE(review): this guard rejects quantum/height without skips only when
        // 'interleaved' is also set — confirm whether the condition should instead be
        // ( !interleaved || !skips ), as the message suggests skips alone are at issue.
        if ( interleaved && ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) {
            System.err.println( "You specified quantum or height, but did not turn on skips." );
            return;
        }

        new PartitionDocumentally( inputBasename, outputBasename, strategy, strategyFilename,
            jsapResult.getInt( "bloom" ), jsapResult.getInt( "bufferSize" ),
            CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
            interleaved, skips,
            jsapResult.getInt( "quantum" ), jsapResult.getInt( "height" ),
            jsapResult.getInt( "skipBufferSize" ), jsapResult.getLong( "logInterval" ) ).run();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -