📄 combine.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
	protected abstract int combine( int numUsedIndices ) throws IOException;			public void run() throws ConfigurationException, IOException {		final Logger logger = Util.getLogger( this.getClass() );		final ProgressLogger pl = new ProgressLogger( logger, logInterval );		final int maxDocSize;		long totalSize = 0;		if ( writeSizes ) {			size = new int[ numberOfDocuments ];			logger.info( "Combining sizes..." );			maxDocSize = combineSizes();			final OutputBitStream outputSizes = new OutputBitStream( outputBasename + DiskBasedIndex.SIZES_EXTENSION, bufferSize );			for( int i = 0; i < numberOfDocuments; i++ ) {				totalSize += size[ i ];				outputSizes.writeGamma( size[ i ] );			}			outputSizes.close();			logger.info( "Sizes combined." );		}		else maxDocSize = -1;				// To write the global count of each term		final OutputBitStream outputGlobCounts = writeGlobCounts ? new OutputBitStream( outputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) : null;		// To write the frequency of each term		final OutputBitStream frequencies = metadataOnly ? null : new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );		// To write the new term list		final PrintWriter termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ), bufferSize ) );				// The current term		MutableString currTerm;				// Total number of pointers and occurrences		long numPointers = 0;				pl.expectedUpdates = writeGlobCounts ? numberOfOccurrences : -1;		pl.itemsName = "occurrences";		pl.logInterval = logInterval;		pl.start( "Combining lists..." );		int totalFrequency, numTerms = 0, numUsedIndices, k;		long totalGlobCount = 0;		// TODO: use the front of the queue?		while( ! termQueue.isEmpty() ) {			numUsedIndices = 0;			// We read a new word from the queue, copy it and write it to the term file			currTerm = term[ k = usedIndex[ numUsedIndices++ ] = termQueue.first() ].copy();						if ( DEBUG ) System.err.println( "Merging term " + currTerm );						currTerm.println( termFile );			if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();			else termQueue.changed();						// Then, we extract all equal words from the queue, accumulating the set of indices in inIndex and currIndex			while( ! termQueue.isEmpty() && term[ termQueue.first() ].equals( currTerm ) ) {				k = usedIndex[ numUsedIndices++ ] = termQueue.first();				if ( termReader[ k ].readLine( term[ k ] ) == null ) termQueue.dequeue();				else termQueue.changed();			}						if ( numUsedIndices > 1 ) Arrays.sort( usedIndex, 0, numUsedIndices );			// Load index iterators			for( int i = numUsedIndices; i-- != 0; ) indexIterator[ usedIndex[ i ] ] = indexReader[ usedIndex[ i ] ].nextIterator();			numTerms++;			if ( writeGlobCounts ) {				// Compute and write the total global count. This works for all kind of indices.				totalGlobCount = 0;				for( int i = 0; i < numUsedIndices; i++ ) totalGlobCount += globCounts[ usedIndex[ i ] ].readGamma();				outputGlobCounts.writeLongGamma( totalGlobCount );			}									if ( ! metadataOnly ) {				totalFrequency = combine( numUsedIndices );				frequencies.writeGamma( totalFrequency );				numPointers += totalFrequency;			}			/* A trick to get a correct prediction. */			if ( writeGlobCounts ) pl.count += totalGlobCount - 1;			pl.update();		}		pl.done();				if ( writeGlobCounts ) outputGlobCounts.close();		termFile.close();		if ( ! metadataOnly ) {			frequencies.close();			for( int i = numIndices; i-- != 0; ) {				indexReader[ i ].close();				if ( writeGlobCounts ) globCounts[ i ].close();				termReader[ i ].close();			}			final long indexSize = indexWriter.writtenBits();			indexWriter.close();			final Properties properties = indexWriter.properties();			additionalProperties.setProperty( Index.PropertyKeys.SIZE, indexSize );			additionalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );			additionalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, numberOfOccurrences );			properties.addAll( additionalProperties );			logger.debug( "Post-merge properties: " + new ConfigurationMap( properties ) );			properties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );		}						final PrintStream stats = new PrintStream( new FileOutputStream ( outputBasename + DiskBasedIndex.STATS_EXTENSION ) );		if ( writeSizes ) stats.println( "Average document size: " + Util.format( (double)totalSize / numberOfDocuments ) );		if ( ! metadataOnly ) indexWriter.printStats( stats );		stats.close();	}	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {		main( arg, null );	}		public static void main( final String[] arg, final Class<? extends Combine> combineClass ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {				SimpleJSAP jsap = new SimpleJSAP( Combine.class.getName(), "Combines several indices. By default, documents are concatenated, but you can also merge or paste them by choosing the suitable options, or invoking the corresponding subclass instead of " + Combine.class.getName() + ". Note that by combining a single input index you can recompress an index with new parameters.",				new Parameter[] {				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),				new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),				new Switch( "skips", JSAP.NO_SHORTFLAG, "skips", "Requires skips (which however are present by default, unless you required an interleaved index)." ),				new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),				new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, "64", JSAP.NOT_REQUIRED, 'Q', "quantum", "The skip quantum." ),				new FlaggedOption( "height", JSAP.INTSIZE_PARSER, "8", JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),				new Switch( "metadataOnly", 'o', "metadata-only", "Combines only metadata (sizes, terms and globcounts)." ),				new Switch( "merge", 'm', "merge", "Merges indices (duplicates cause an error)." ),				new Switch( "duplicates", 'd', "duplicates", "Pastes indices, concatenating the document positions for duplicates." ),				new Switch( "properties", 'p', "properties", "The only specified inputBasename will be used to load a property file written by the scanning process." ),				new FlaggedOption( "tempFileDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "temp-file-dir", "The directory for the temporary file used during pasting." ),				new FlaggedOption( "tempFileBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Paste.DEFAULT_MEMORY_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "temp-file-buffer-size", "The size of the buffer for the temporary file during pasting." ),				new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ),				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The basenames of the indices to be merged." )		});				JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;		final boolean skips = jsapResult.getBoolean( "skips" );		final boolean interleaved = jsapResult.getBoolean( "interleaved" );		if ( interleaved && ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) {			System.err.println( "You specified quantum or height, but did not turn on skips." );			return;		}				if ( combineClass != null && jsapResult.userSpecified( "duplicates" ) || jsapResult.userSpecified( "merge") )			throw new IllegalArgumentException( "When invoking " + Combine.class.getName() + " from " + combineClass.getName() + " you cannot choose the combination process" );				final String[] inputBasename;		if ( jsapResult.getBoolean( "properties" ) ) {			if ( jsapResult.getStringArray( "inputBasename" ).length > 1 ) throw new IllegalArgumentException( "When using --properties, you must specify exactly one inputBasename" );			inputBasename = new Properties( jsapResult.getStringArray( "inputBasename" )[ 0 ] + Scan.CLUSTER_PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );		}		else inputBasename = jsapResult.getStringArray( "inputBasename" ); 		// TODO: resolve problem of passing default flag values without knowing type of index		( combineClass == Paste.class || jsapResult.getBoolean( "duplicates" ) ?		(Combine)new Paste( jsapResult.getString( "outputBasename" ), 				inputBasename,				jsapResult.getBoolean( "metadataOnly" ),				jsapResult.getInt( "bufferSize" ),				jsapResult.getFile( "tempFileDir" ),				jsapResult.getInt( "tempFileBufferSize" ),				CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),				interleaved,				skips,				jsapResult.getInt( "quantum" ),				jsapResult.getInt( "height" ),				jsapResult.getInt( "skipBufferSize" ),				jsapResult.getLong( "logInterval" ) ) :		combineClass == Merge.class || jsapResult.getBoolean( "merge" ) ?				(Combine)new Merge( jsapResult.getString( "outputBasename" ), 						inputBasename,						jsapResult.getBoolean( "metadataOnly" ),						jsapResult.getInt( "bufferSize" ),						CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),						interleaved,						skips,						jsapResult.getInt( "quantum" ),						jsapResult.getInt( "height" ),						jsapResult.getInt( "skipBufferSize" ),						jsapResult.getLong( "logInterval" ) ) :							(Combine)new Concatenate( jsapResult.getString( "outputBasename" ), 									inputBasename,									jsapResult.getBoolean( "metadataOnly" ),									jsapResult.getInt( "bufferSize" ),									CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),									interleaved,									skips,									jsapResult.getInt( "quantum" ),									jsapResult.getInt( "height" ),									jsapResult.getInt( "skipBufferSize" ),									jsapResult.getLong( "logInterval" ) )											).run(); 	}}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -