⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 partitiondocumentally.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
		// NOTE(review): everything up to the first two closing braces below is the *tail* of a
		// method whose signature lies before this chunk (invoked later as partitionSizes()); it is
		// reproduced verbatim — only comments were added in this review.
		// ALERT: for the time being, we decide whether to "fill the gaps" in sizes using as sole indicator the equality between global and local number of documents.
		int size, localIndex;
		if ( globalIndex.numberOfDocuments == strategy.numberOfDocuments( 0 ) ) {
			// Local index 0 claims every global document: each local size list gets one entry per
			// global document — the real size for the owning index, a gamma-coded 0 for all others.
			for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
				localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
				if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
				for( int l = numIndices; l-- != 0; ) if ( l != localIndex ) localSizes[ l ].writeGamma( 0 ); 
			}
		}
		else { 
			// Disjoint partitioning: each document's size goes only to its owning local index.
			for( int i = 0; i < globalIndex.numberOfDocuments; i++ ) {
				localSizes[ localIndex = strategy.localIndex( i ) ].writeGamma( size = sizes.readGamma() );
				if ( maxDocSize[ localIndex ] < size ) maxDocSize[ localIndex ] = size;
			}
		}
		sizes.close();
		for ( int i = 0; i < numIndices; i++ ) localSizes[ i ].close();
	}
	}
	
	/**
	 * Partitions the global index documentally into {@code numIndices} local indices.
	 *
	 * <p>For each term, postings are first staged into a per-local-index temporary bit stream
	 * (global pointer, optional payload, optional count and positions, all gamma-coded), and then
	 * copied into the corresponding local index writer with pointers remapped through the
	 * partitioning strategy. Finally, per-index and global property files are written.
	 *
	 * @throws Exception if reading the global index or writing a local index fails.
	 */
	public void run() throws Exception {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		final IntList sizeList = globalIndex.sizes;
		partitionSizes();
		
		// Scratch space, reused across all terms.
		final int[] position = new int[ globalIndex.maxCount ];  // position buffer for one posting
		final int[] localFrequency = new int[ numIndices ];  // per-index frequency of the current term
		final int[] usedIndex = new int[ numIndices ];  // local indices touched by the current term
		final InputBitStream[] direct = new InputBitStream[ numIndices ];  // reads temp data from the in-memory buffer
		final InputBitStream[] indirect = new InputBitStream[ numIndices ];  // reads temp data from the backing file
		final BloomFilter[] bloomFilter = bloomFilterPrecision != 0 ? new BloomFilter[ numIndices ] : null;
		final File[] tempFile = new File[ numIndices ];
		final CachingOutputBitStream[] temp = new CachingOutputBitStream[ numIndices ];
		IndexIterator indexIterator;
		
		for ( int i = 0; i < numIndices; i++ ) {
			tempFile[ i ] = new File( localBasename[ i ] + ".temp" );
			temp[ i ] = new CachingOutputBitStream( tempFile[ i ], bufferSize );
			direct[ i ] = new InputBitStream( temp[ i ].buffer() );
			indirect[ i ] = new InputBitStream( tempFile[ i ] );
			if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
		}
		int usedIndices;
		MutableString currentTerm = new MutableString();
		Payload payload = null;
		int frequency, globalPointer, localIndex, localPointer, count = -1;
		pl.expectedUpdates = globalIndex.numberOfPostings;
		pl.itemsName = "postings";
		pl.logInterval = logInterval;
		pl.start( "Partitioning index..." );
		for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
			terms.readLine( currentTerm );
			indexIterator = indexReader.nextIterator();
			usedIndices = 0;
			frequency = indexIterator.frequency();
			
			// Phase 1: scatter this term's postings into the per-index temporary streams.
			for ( int j = 0; j < frequency; j++ ) {
				globalPointer = indexIterator.nextDocument();
				localIndex = strategy.localIndex( globalPointer );
				if ( localFrequency[ localIndex ] == 0 ) {
					// First time we see a document for this index.
					currentTerm.println( localTerms[ localIndex ] );
					numTerms[ localIndex ]++;
					usedIndex[ usedIndices++ ] = localIndex;
					if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
				}
				
				/* Store temporarily posting data; note that we save the global pointer as we
				 * will have to access the size list. */
				
				localFrequency[ localIndex ]++;
				numPostings[ localIndex ]++;
				temp[ localIndex ].writeGamma( globalPointer );
				// NOTE(review): payload is fetched when the *global* index has payloads but written
				// only when havePayloads is set — presumably these flags coincide; confirm upstream.
				if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
				if ( havePayloads ) payload.write( temp[ localIndex ] );
				
				if ( haveCounts ) {
					count = indexIterator.count();
					temp[ localIndex ].writeGamma( count );
					globCount[ localIndex ] += count;
					if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count; 
					if ( havePositions ) {
						final int[] pos = indexIterator.positionArray();
						// TODO: compress this stuff
						for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] ); 
					}
				}
			}
			
			// We now run through the indices used by this term and copy from the temporary buffer.
			OutputBitStream obs;
			
			for( int k = 0; k < usedIndices; k++ ) {
				final int i = usedIndex[ k ];
				localFrequencies[ i ].writeGamma( localFrequency[ i ] );
				if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
				if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
				globCount[ i ] = 0;
				
				InputBitStream ibs;
				indexWriter[ i ].newInvertedList();
				temp[ i ].align();
				// Prefer reading back straight from the caching stream's in-memory buffer; fall
				// back to the backing file when the data no longer fits in the buffer.
				if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
				else {
					// We cannot read directly from the internal buffer.
					ibs = indirect[ i ];
					ibs.flush();
					temp[ i ].flush();
				}
				ibs.position( 0 );
				
				indexWriter[ i ].writeFrequency( localFrequency[ i ] );
				for( int j = 0; j < localFrequency[ i ]; j++ ) {
					obs = indexWriter[ i ].newDocumentRecord();
					globalPointer = ibs.readGamma();
					// Remap the stored global pointer into the local document space.
					localPointer = strategy.localPointer( globalPointer );	
					indexWriter[ i ].writeDocumentPointer( obs, localPointer );
					if ( havePayloads ) {
						payload.read( ibs );
						indexWriter[ i ].writePayload( obs, payload );
					}
					if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
					if ( havePositions ) {
						for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
						// The size list (if any) is indexed by the *global* pointer — the reason the
						// global pointer was staged in the temp stream.
						indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
					}
					
				}
				// Rewind the temporary stream for the next term.
				temp[ i ].position( 0 );
				temp[ i ].writtenBits( 0 );
				localFrequency[ i ] = 0;
			}
			
			usedIndices = 0;
			// pl.update() itself advances the counter by one — presumably why frequency - 1 here.
			pl.count += frequency - 1;
			pl.update();
		}
		pl.done();
		
		// Write per-local-index property files, then the global (cluster) property file.
		Properties globalProperties = new Properties();
		globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
		
		for ( int i = 0; i < numIndices; i++ ) {
			localFrequencies[ i ].close();
			if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
			localTerms[ i ].close(); 
			indexWriter[ i ].close();
			if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
			temp[ i ].close();
			tempFile[ i ].delete();
			
			Properties localProperties = indexWriter[ i ].properties();
			localProperties.addAll( globalProperties );
			localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
			localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
			localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
			localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
			localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
			localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
			if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
			if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
			localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
		}
		if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
		for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
		globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
		// If we partition an index with a single term, by definition we have a flat cluster
		globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
		globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
		globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
		globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
		globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
		globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
		if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
		/* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
		 * strategy we can optimise a bit. */
		
		globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, 
				strategy instanceof ContiguousDocumentalStrategy ?
						DocumentalConcatenatedCluster.class.getName() :
						DocumentalMergedCluster.class.getName() );
		
		globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );	
	}
	
	/**
	 * Command-line entry point: parses options with JSAP, loads or builds the documental
	 * partitioning strategy, and runs the partitioning.
	 *
	 * @param arg the command-line arguments (see the option descriptions below).
	 * @throws IllegalArgumentException if no partitioning strategy is specified.
	 */
	public static void main( final String arg[] ) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, Exception {
		
		SimpleJSAP jsap = new SimpleJSAP( PartitionDocumentally.class.getName(), "Partitions an index documentally.",
				new Parameter[] {
			new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
			new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
			new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised documental partitioning strategy." ),
			new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
			new FlaggedOption( "bloom", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 'B', "bloom", "Generates Bloom filters with given precision." ),
			new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
			new Switch( "skips", JSAP.NO_SHORTFLAG, "skips", "Requires skips (which however are present by default, unless you required an interleaved index)." ),
			new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
			new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, "64", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum." ),
			new FlaggedOption( "height", JSAP.INTSIZE_PARSER, "8", JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
			new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
			new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
			new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
		});
		
		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;
		String inputBasename = jsapResult.getString( "inputBasename" );
		String outputBasename = jsapResult.getString( "outputBasename" );
		String strategyFilename = jsapResult.getString( "strategy" );
		DocumentalPartitioningStrategy strategy = null;
		if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
			// Build a uniform strategy on the fly and serialise it next to the output index.
			strategy = DocumentalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), Index.getInstance( inputBasename ).numberOfDocuments );
			BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
		}
		else if ( strategyFilename != null ) strategy = (DocumentalPartitioningStrategy)BinIO.loadObject( strategyFilename );
		else throw new IllegalArgumentException( "You must specify a partitioning strategy" );
		
		final boolean skips = jsapResult.getBoolean( "skips" );
		final boolean interleaved = jsapResult.getBoolean( "interleaved" );
		// NOTE(review): quantum/height only make sense with skips; the interleaved guard here
		// mirrors the upstream tool's option validation — confirm against the class constructor.
		if ( interleaved && ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) {
			System.err.println( "You specified quantum or height, but did not turn on skips." );
			return;
		}
		new PartitionDocumentally( inputBasename,
				outputBasename, 
				strategy, 
				strategyFilename,
				jsapResult.getInt( "bloom" ),
				jsapResult.getInt( "bufferSize" ),
				CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
				interleaved,
				skips,
				jsapResult.getInt( "quantum" ),
				jsapResult.getInt( "height" ),
				jsapResult.getInt( "skipBufferSize" ),
				jsapResult.getLong( "logInterval" ) ).run();
	}
	
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -