📄 indexbuilder.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	 * during index construction wil not be deleted.	 * 	 * @param keepBatches the new value for the &ldquo;keep batches&rdquo; flag.	 * @return this index builder.	 */	public IndexBuilder keepBatches( final boolean keepBatches ) {		this.keepBatches = keepBatches;		return this;	}		/** Sets the writer compression flags for standard indices (default: {@link CompressionFlags#DEFAULT_STANDARD_INDEX}).	 * 	 * @param standardWriterFlags the flags for standard indices.	 * @return this index builder.	 */	public IndexBuilder standardWriterFlags( final Map<Component,Coding> standardWriterFlags ) {		this.standardWriterFlags = standardWriterFlags;		return this;	}		/** Sets the writer compression flags for payload-based indices (default: {@link CompressionFlags#DEFAULT_PAYLOAD_INDEX}).	 * 	 * @param payloadWriterFlags the flags for payload-based indices.	 * @return this index builder.	 */	public IndexBuilder payloadWriterFlags( final Map<Component,Coding> payloadWriterFlags ) {		this.payloadWriterFlags = payloadWriterFlags;		return this;	}		/** Sets the skip flag (default: false). If true, the index will be a {@link BitStreamIndex} with skips. The	 * flag is a no-op unless you require an {@linkplain #interleaved(boolean) interleaved index}.	 * 	 * @param skips the new value for the skip flag.	 * @return this index builder.	 */	public IndexBuilder skips( final boolean skips ) {		this.skips = skips;		return this;	}		/** Sets the interleaved flag (default: false). If true, the index will be an {@linkplain BitStreamIndexWriter interleaved index}.	 * 	 * @param interleaved the new value for the interleaved flag.	 * @return this index builder.	 */	public IndexBuilder interleaved( final boolean interleaved ) {		this.interleaved = interleaved;		return this;	}		/** Sets the skip quantum (default: {@link BitStreamIndex#DEFAULT_QUANTUM}).	 * 	 * @param quantum the skip quantum.	 * @return this index builder.	 
*/
	public IndexBuilder quantum( final int quantum ) {
		this.quantum = quantum;
		return this;
	}

	/** Sets the skip height (default: {@link BitStreamIndex#DEFAULT_HEIGHT}).
	 * 
	 * @param height the skip height.
	 * @return this index builder.
	 */
	public IndexBuilder height( final int height ) {
		this.height = height;
		return this;
	}

	/** Sets the name of a file containing a map on the document indices (default: <code>null</code>).
	 * 
	 * <p>The provided file must contain integers in {@link DataOutput} format. There must be as
	 * many integers as there are documents in the collection provided at construction time, and the
	 * resulting function must be injective (i.e., there must be no duplicates).
	 * 
	 * @param mapFile a file representing a document map (or <code>null</code> for no mapping).
	 * @return this index builder.
	 */
	public IndexBuilder mapFile( final String mapFile ) {
		this.mapFile = mapFile;
		return this;
	}

	/** Sets the logging time interval (default: {@link ProgressLogger#DEFAULT_LOG_INTERVAL}).
	 * 
	 * @param logInterval the logging time interval.
	 * @return this index builder.
	 */
	public IndexBuilder logInterval( final long logInterval ) {
		this.logInterval = logInterval;
		return this;
	}

	/** Sets the temporary directory for batches (default: the directory containing the basename).
	 * 
	 * @param batchDirName the name of the temporary directory for batches, or <code>null</code>
	 * for the directory containing the basename.
	 * @return this index builder.
	 */
	public IndexBuilder batchDirName( final String batchDirName ) {
		this.batchDirName = batchDirName;
		return this;
	}

	/** Sets the class used to build the index term map (default: {@link ImmutableExternalPrefixMap}).
	 * 
	 * <p>The only requirement for <code>termMapClass</code> (besides, of course, implementing {@link StringMap})
	 * is that of having a public constructor accepting a single parameter of type <samp>{@link Iterable}&lt;{@link CharSequence}&gt;</samp>.
* 
	 * <p>The class is validated <em>before</em> it is stored, so a failed call leaves the
	 * previously configured term-map class untouched.
	 * 
	 * @param termMapClass the class used to build the index term map.
	 * @return this index builder.
	 * @throws IllegalArgumentException if <code>termMapClass</code> has no public constructor
	 * accepting an {@link Iterable} (the original failure is attached as the cause).
	 */
	public IndexBuilder termMapClass( final Class<? extends StringMap<? extends CharSequence>> termMapClass ) {
		try {
			// Probe for the constructor that run() will invoke reflectively.
			termMapClass.getConstructor( Iterable.class );
		}
		catch ( Exception e ) {
			throw new IllegalArgumentException( "Class " + termMapClass + " have no constructor accepting an Iterable", e );
		}
		this.termMapClass = termMapClass;
		return this;
	}

	/** Builds the index.
	 * 
	 * <p>This method invokes {@link Scan} to create the batches, then combines the batches of each indexed
	 * field (using {@link Paste} for virtual fields, and {@link Merge} or {@link Concatenate} for the other
	 * fields, depending on whether a document map file was set), and finally builds and stores
	 * a {@link StringMap} for each field using the internally stored settings.
	 * 
	 * <p>If no indexed field was specified, all non-virtual fields, and all virtual fields for which
	 * a resolver is available, are added to the set of indexed fields.
	 * 
	 * <p>If the provided document sequence can be iterated over several times, this method can be called several
	 * times, too, rebuilding each time the index.
	 */
	public void run() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		final DocumentFactory factory = documentSequence.factory();
		if ( indexedFields.isEmpty() ) {
			// We index everything
			for( int i = 0; i < factory.numberOfFields(); i++ ) 
				if ( factory.fieldType( i ) != FieldType.VIRTUAL || virtualDocumentResolvers.containsKey( i ) ) indexedFields.add( i );
		}

		final int[] indexedField = indexedFields.toIntArray();
		final String[] basenameField = new String[ indexedField.length ];
		for( int i = 0; i < indexedField.length; i++ ) basenameField[ i ] = basename + "-" + factory.fieldName( indexedField[ i ] );
		LOGGER.info( "Creating indices " + Arrays.toString( basenameField ) + "..." );

		// Create gap array
		final int[] virtualDocumentGap = new int[ indexedField.length ];
		for( int i = 0; i < indexedField.length; i++ ) virtualDocumentGap[ i ] = virtualDocumentGaps.get( i ); 

		// Create virtual document resolver array
		// NOTE(review): the keys of virtualDocumentResolvers are used here as positions in an array of
		// length indexedField.length, whereas the "index everything" loop above tests them as factory
		// field indices. If the two numberings differ this can index out of bounds -- confirm the
		// intended key convention against the declarations of the two maps.
		final VirtualDocumentResolver[] virtualDocumentResolver = new VirtualDocumentResolver[ indexedField.length ];
		for( int i: virtualDocumentResolvers.keySet() ) virtualDocumentResolver[ i ] = virtualDocumentResolvers.get( i ); 

		Scan.run( basename, 
				documentSequence, 
				termProcessor, 
				zipCollectionBasename,
				scanBufferSize,
				documentsPerBatch,
				indexedField,
				virtualDocumentResolver,
				virtualDocumentGap,
				mapFile,
				logInterval,
				batchDirName );

		// Drop the local references to the resolvers: let's keep the garbage collector happy
		ObjectArrays.fill( virtualDocumentResolver, null );

		final File batchDir = batchDirName == null ? null : new File( batchDirName );

		for ( int i = 0; i < indexedField.length; i++ ) {
			final int batches;
			if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.VIRTUAL ) {
				// Virtual fields: the number of batches is read from the field property file, and batches are pasted.
				batches = new Properties( basenameField[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION ).getInt( Index.PropertyKeys.BATCHES );
				final String[] inputBasename = new String[ batches ];
				for( int j = 0; j < inputBasename.length; j++ ) inputBasename[ j ] = Scan.batchBasename( j, basenameField[ i ], batchDir ); 
				new Paste( basenameField[ i ], inputBasename, false, combineBufferSize, batchDir, pasteBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run();
			}
			else {
				// Other fields: batch names come from the cluster property file; batches are merged
				// if a document map was provided, and concatenated otherwise.
				final String[] inputBasename = new Properties( basenameField[ i ] + Scan.CLUSTER_PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
				batches = inputBasename.length;
				if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.TEXT ) {
					if ( mapFile != null ) new Merge( basenameField[ i ], inputBasename, false, combineBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run();
					else new Concatenate( basenameField[ i ], inputBasename, false, combineBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run();
				}
				else {
					if ( mapFile != null ) new Merge( basenameField[ i ], inputBasename, false, combineBufferSize, payloadWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run();
					else new Concatenate( basenameField[ i ], inputBasename, false, combineBufferSize, payloadWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run();
				} 
			}

			// TODO: this is a bit dirty, because in the else above we actually use the batch names found in the cluster property files.
			if ( ! keepBatches ) Scan.cleanup( basenameField[ i ], batches, batchDir );
		}

		LOGGER.info( "Creating term maps (class: " + termMapClass.getSimpleName() + ")..." );
		for( int i = 0; i < indexedField.length; i++ ) 
			BinIO.storeObject( StringMaps.synchronize( termMapClass.getConstructor( Iterable.class ).newInstance( new FileLinesCollection( basenameField[ i ] + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ) ) ), basenameField[ i ] + DiskBasedIndex.TERMMAP_EXTENSION );

		LOGGER.info( "Indexing completed." );
	}

	/** Command-line entry point: parses the options, configures an {@link IndexBuilder} accordingly and runs it.
	 * 
	 * <p>The document sequence is either the object specified with <samp>--object-sequence</samp>, or the
	 * result of {@link Scan#getSequence} on the remaining sequence-related options; specifying both
	 * <samp>--sequence</samp> and <samp>--object-sequence</samp> is an error. Requesting an interleaved index
	 * without skips while explicitly setting quantum or height prints an error message and returns
	 * without building anything.
	 * 
	 * @param arg the command-line arguments.
	 */
	@SuppressWarnings("unchecked")
	public static void main( final String[] arg ) throws JSAPException, InvocationTargetException, NoSuchMethodException, IllegalAccessException, ConfigurationException, ClassNotFoundException, IOException, InstantiationException, URISyntaxException {

		SimpleJSAP jsap = new SimpleJSAP( IndexBuilder.class.getName(), "Builds an index (creates batches, combines them, and builds a term map).",
				new Parameter[] {
				new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ),
				new FlaggedOption( "objectSequence", new ObjectParser( DocumentSequence.class, MG4JClassParser.PACKAGE ), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "object-sequence", "An object specification describing a document sequence that will be used instead of stdin." ),
				new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),
				new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),
				new FlaggedOption( "termProcessor", JSAP.STRING_PARSER, NullTermProcessor.class.getName(), JSAP.NOT_REQUIRED, 't', "term-processor", "Sets the term processor to the given class." ),
				new FlaggedOption( "termMap", MG4JClassParser.getParser(), ImmutableExternalPrefixMap.class.getName(), JSAP.NOT_REQUIRED, 'm', "term-map", "Sets the term map class." ),
				new Switch( "downcase", JSAP.NO_SHORTFLAG, "downcase", "A shortcut for setting the term processor to the downcasing processor." ),
				new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'I', "indexed-field", "The field(s) of the document factory that will be indexed. (default: all fields)" ).setAllowMultipleDeclarations( true ),
				new Switch( "allFields", 'a', "all-fields", "Index also all virtual fields; has no effect if indexedField has been used at least once." ),
				new FlaggedOption( "batchSize", JSAP.INTSIZE_PARSER, Integer.toString( Scan.DEFAULT_BATCH_SIZE ), JSAP.NOT_REQUIRED, 's', "batch-size", "The size of a batch, in documents. (default: " + Scan.DEFAULT_BATCH_SIZE + ")" ),
				new Switch( "keepBatches", JSAP.NO_SHORTFLAG, "keep-batches", "Do not delete intermediate batch files." ),
				new FlaggedOption( "virtualDocumentResolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'v', "virtual-document-resolver", "The virtual document resolver. It can be specified several times in the form [<field>:]<filename>. If the field is omitted, it sets the document resolver for all virtual fields." ).setAllowMultipleDeclarations( true ),
				new FlaggedOption( "virtualDocumentGap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "virtual-document-gap", "The virtual document gap. It can be specified several times in the form [<field>:]<gap>. If the field is omitted, it sets the document gap for all virtual fields; the default gap is " + Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP ).setAllowMultipleDeclarations( true ),
				new FlaggedOption( "scanBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Scan.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "scan-buffer-size", "The size of an I/O buffer for the scanning phase." ),
				new FlaggedOption( "combineBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Combine.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'B', "combine-buffer-size", "The size of an I/O buffer for the combination phase." ),
				new FlaggedOption( "pasteBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Paste.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "paste-buffer-size", "The size of the internal temporary buffer used while pasting indices." ),
				new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
				new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( Scan.DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter." ),
				new FlaggedOption( "renumber", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', "renumber", "The filename of a document renumbering." ),
				new FlaggedOption( "zipCollection", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'z', "zip", "Creates a support ZipDocumentCollection with given basename." ),
				new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for textual indices (may be specified several times)." ).setAllowMultipleDeclarations( true ),
				new FlaggedOption( "payloadComp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'C', "comp-payload", "A compression flag for payload indices (may be specified several times)." ).setAllowMultipleDeclarations( true ),
				new Switch( "skips", JSAP.NO_SHORTFLAG, "skips", "Requires skips (which however are present by default, unless you required an interleaved index)." ),
				new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
				new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_QUANTUM ), JSAP.NOT_REQUIRED, 'Q', "quantum", "Enable skips with given quantum." ),
				new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "Enable skips with given height." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new FlaggedOption( "tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for all temporary batch files." ),
				new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." )
		});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );

		final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) 
			? (DocumentSequence)jsapResult.getObject( "objectSequence" ) 
			: Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );

		final DocumentFactory factory = documentSequence.factory();
		final int[] indexedField = Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) );
		final VirtualDocumentResolver[] virtualDocumentResolver = Scan.parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory );
		final int[] virtualDocumentGap = Scan.parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory );

		// The --downcase switch takes precedence over an explicitly specified term processor.
		final TermProcessor termProcessor = jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() :
			ObjectParser.fromSpec( jsapResult.getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ); 

		final boolean skips = jsapResult.getBoolean( "skips" );
		final boolean interleaved = jsapResult.getBoolean( "interleaved" );
		if ( interleaved && ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) {
			System.err.println( "You specified quantum or height, but did not turn on skips." );
			return;
		}

		IndexBuilder indexBuilder = new IndexBuilder( jsapResult.getString( "basename" ), documentSequence )
		.termProcessor( termProcessor )
		.zipCollectionBasename( jsapResult.getString( "zipCollection" ) )
		.scanBufferSize( jsapResult.getInt( "scanBufferSize" ) )
		.skipBufferSize( jsapResult.getInt( "skipBufferSize" ) )
		.pasteBufferSize( jsapResult.getInt( "pasteBufferSize" ) )
		.combineBufferSize( jsapResult.getInt( "combineBufferSize" ) )
		.documentsPerBatch( jsapResult.getInt( "batchSize" ) )
		.keepBatches( jsapResult.getBoolean( "keepBatches" ) )
		.termMapClass( jsapResult.getClass( "termMap" ) )
		.indexedFields( indexedField )
		.skips( skips )
		.interleaved( interleaved )
		.quantum( jsapResult.getInt( "quantum" ) )
		.height( jsapResult.getInt( "height" ) )
		.logInterval( jsapResult.getLong( "logInterval" ) )
		.batchDirName( jsapResult.getString( "tempDir" ) );

		for( int i = 0; i < virtualDocumentResolver.length; i++ ) if ( virtualDocumentResolver[ i ] != null ) indexBuilder.virtualDocumentResolvers.put( i, virtualDocumentResolver[ i ] );
		for( int i = 0; i < virtualDocumentGap.length; i++ ) indexBuilder.virtualDocumentGaps.put( i, virtualDocumentGap[ i ] );

		if ( jsapResult.userSpecified( "comp" ) ) indexBuilder.standardWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ) );
		// The option is declared above with ID "payloadComp" (long flag --comp-payload): the lookup must use the ID.
		if ( jsapResult.userSpecified( "payloadComp" ) ) indexBuilder.payloadWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "payloadComp" ), CompressionFlags.DEFAULT_PAYLOAD_INDEX ) );
		if ( jsapResult.userSpecified( "renumber" ) ) indexBuilder.mapFile( jsapResult.getString( "renumber" ) );

		indexBuilder.run();
	}
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -