📄 PartitionLexically.java
			termNumber++;
		}
		terms.close();
		for( int i = 0; i < numIndices; i++ ) localTerms[ i ].close();
		pl.done();
	}

	public void run() throws ConfigurationException, IOException, ClassNotFoundException {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		final byte[] buffer = new byte[ bufferSize ];
		// One output stream per local index for each index component.
		final OutputBitStream[] localIndexStream = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localPositionsStream = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localOffsets = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localFrequencies = new OutputBitStream[ numIndices ];
		final OutputBitStream[] localGlobCounts = new OutputBitStream[ numIndices ];
		final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
		final int[] numTerms = new int[ numIndices ];
		final long[] numberOfOccurrences = new long[ numIndices ];
		final long[] numberOfPostings = new long[ numIndices ];
		// High-performance indices keep positions in a separate file.
		final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );

		final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
		final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
		final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
		final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
		final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
		final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
		offsets.readGamma(); // Discard the first offset (always zero).

		for( int i = 0; i < numIndices; i++ ) {
			localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
			if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
			localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
			localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
			localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
			localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
			localOffsets[ i ].writeGamma( 0 );
		}

		// The current term.
		final MutableString currTerm = new MutableString();

		pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
		pl.itemsName = "bits";
		pl.logInterval = logInterval;
		pl.start( "Partitioning index..." );
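		/*
		 * Main loop: for each term in the global term list, the partitioning
		 * strategy maps the global term number to a local index k. Frequencies,
		 * global counts and the term itself go to the k-th local index, and the
		 * term's posting-list bits are poured verbatim from the global bitstream.
		 * For high-performance indices a term's positions are poured one term
		 * late, since only the next header reveals where they end.
		 */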
		int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
		long length, count, positionsOffset = 0;
		int res, frequency;

		while( terms.readLine( currTerm ) != null ) {
			k = strategy.localIndex( termNumber ); // The local index of this term.
			if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
			numTerms[ k ]++;

			if ( isHighPerformance ) {
				// The header of this term stores the offset of its positions,
				// which also marks where the previous term's positions end.
				final long temp = globalIndex.readBits();
				positionsOffset = globalIndex.readLongDelta();
				previousHeaderLength = (int)( globalIndex.readBits() - temp );
				if ( prevK != -1 ) {
					length = positionsOffset - globalPositions.readBits();
					pl.count += length;
					while( length > 0 ) {
						res = (int)Math.min( bufferSize * 8, length );
						globalPositions.read( buffer, res );
						localPositionsStream[ prevK ].write( buffer, res );
						length -= res;
					}
				}
				newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
			}

			frequency = frequencies.readGamma();
			localFrequencies[ k ].writeGamma( frequency );
			numberOfPostings[ k ] += frequency;

			count = globCounts.readLongGamma();
			numberOfOccurrences[ k ] += count;
			localGlobCounts[ k ].writeLongGamma( count );

			currTerm.println( localTerms[ k ] );

			// Copy the posting list verbatim, rewriting the header for local offsets.
			length = offsets.readLongGamma() - previousHeaderLength;
			localOffsets[ k ].writeLongGamma( length + newHeaderLength );
			pl.count += length + previousHeaderLength - 1;
			while( length > 0 ) {
				res = (int)Math.min( bufferSize * 8, length );
				globalIndex.read( buffer, res );
				localIndexStream[ k ].write( buffer, res );
				length -= res;
			}

			pl.update();
			prevK = k;
			termNumber++;
		}

		// We pour the last piece of positions: what remains of the positions
		// file belongs to the last term seen.
		if ( isHighPerformance ) {
			if ( prevK != -1 ) {
				length = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() * 8 - globalPositions.readBits();
				while( length > 0 ) {
					res = (int)Math.min( bufferSize * 8, length );
					globalPositions.read( buffer, res );
					localPositionsStream[ prevK ].write( buffer, res );
					length -= res;
				}
			}
		}

		pl.done();

		terms.close();
		offsets.close();
		frequencies.close();
		globCounts.close();
		globalIndex.close();
		if ( isHighPerformance ) globalPositions.close();

		// We copy the relevant properties from the original index.
		Properties properties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		Properties globalProperties = new Properties();
		if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
		globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, false );
		globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS, LexicalCluster.class.getName() );
		for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
		globalProperties.setProperty( Index.PropertyKeys.FIELD, properties.getProperty( Index.PropertyKeys.FIELD ) );
		globalProperties.setProperty( Index.PropertyKeys.POSTINGS, properties.getProperty( Index.PropertyKeys.POSTINGS ) );
		globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, properties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
		globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, properties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMS, properties.getProperty( Index.PropertyKeys.TERMS ) );
		globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, properties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
		globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, properties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
		globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, properties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
		globalProperties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		LOGGER.debug( "Properties for clustered index " + outputBasename + ": " + new ConfigurationMap( globalProperties ) );

		for( int i = 0; i < numIndices; i++ ) {
			localIndexStream[ i ].close();
			if ( isHighPerformance ) localPositionsStream[ i ].close();
			localOffsets[ i ].close();
			localFrequencies[ i ].close();
			localGlobCounts[ i ].close();
			localTerms[ i ].close();

			// Each local index gets a copy of the global sizes file.
			final InputStream input = new FileInputStream( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
			final OutputStream output = new FileOutputStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
			IOUtils.copy( input, output );
			input.close();
			output.close();

			Properties localProperties = new Properties();
			localProperties.addAll( globalProperties );
			localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
			localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, numberOfOccurrences[ i ] );
			localProperties.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings[ i ] );
			localProperties.setProperty( Index.PropertyKeys.INDEXCLASS, properties.getProperty( Index.PropertyKeys.INDEXCLASS ) );
			localProperties.addProperties( Index.PropertyKeys.CODING, properties.getStringArray( Index.PropertyKeys.CODING ) );
			localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPQUANTUM ) );
			localProperties.setProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT, properties.getProperty( BitStreamIndex.PropertyKeys.SKIPHEIGHT ) );
			if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
			localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
			LOGGER.debug( "Post-partitioning properties for index " + localBasename[ i ] + ": " + new ConfigurationMap( localProperties ) );
		}
	}

	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {
		SimpleJSAP jsap = new SimpleJSAP( PartitionLexically.class.getName(), "Partitions an index lexically.",
			new Parameter[] {
				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new FlaggedOption( "strategy", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "strategy", "A serialised lexical partitioning strategy." ),
				new FlaggedOption( "uniformStrategy", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "uniform", "Requires a uniform partitioning in the given number of parts." ),
				new Switch( "termsOnly", 't', "terms-only", "Just partition the term list." ),
				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the global index." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the local indices." )
			});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		String inputBasename = jsapResult.getString( "inputBasename" );
		String outputBasename = jsapResult.getString( "outputBasename" );
		String strategyFilename = jsapResult.getString( "strategy" );
		LexicalPartitioningStrategy strategy = null;

		if ( jsapResult.userSpecified( "uniformStrategy" ) ) {
			strategy = LexicalStrategies.uniform( jsapResult.getInt( "uniformStrategy" ), DiskBasedIndex.getInstance( inputBasename, false, false, true ) );
			BinIO.storeObject( strategy, strategyFilename = outputBasename + IndexCluster.STRATEGY_DEFAULT_EXTENSION );
		}
		else if ( strategyFilename != null ) strategy = (LexicalPartitioningStrategy)BinIO.loadObject( strategyFilename );
		else throw new IllegalArgumentException( "You must specify a splitting strategy" );

		final PartitionLexically partitionLexically = new PartitionLexically( inputBasename, outputBasename, strategy, strategyFilename, jsapResult.getInt( "bufferSize" ), jsapResult.getLong( "logInterval" ) );

		if ( jsapResult.getBoolean( "termsOnly" ) ) partitionLexically.runTermsOnly();
		else partitionLexically.run();
	}
}
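For reference, a minimal sketch of driving the partitioner programmatically rather than from the command line; it simply mirrors what main() above does when a uniform strategy is requested. The basenames "global" and "local" and the part count 4 are placeholders, and the snippet assumes it runs inside this class so that the constructor and DEFAULT_BUFFER_SIZE are visible:

	// Build a uniform strategy over the global index and serialise it,
	// exactly as main() does when -u/--uniform is specified.
	final LexicalPartitioningStrategy strategy = LexicalStrategies.uniform( 4, DiskBasedIndex.getInstance( "global", false, false, true ) );
	final String strategyFilename = "local" + IndexCluster.STRATEGY_DEFAULT_EXTENSION;
	BinIO.storeObject( strategy, strategyFilename );
	// Partition the index into four lexical parts.
	new PartitionLexically( "global", "local", strategy, strategyFilename, DEFAULT_BUFFER_SIZE, ProgressLogger.DEFAULT_LOG_INTERVAL ).run();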