📄 indexbuilder.java
字号:
* during index construction wil not be deleted. * * @param keepBatches the new value for the “keep batches” flag. * @return this index builder. */ public IndexBuilder keepBatches( final boolean keepBatches ) { this.keepBatches = keepBatches; return this; } /** Sets the writer compression flags for standard indices (default: {@link CompressionFlags#DEFAULT_STANDARD_INDEX}). * * @param standardWriterFlags the flags for standard indices. * @return this index builder. */ public IndexBuilder standardWriterFlags( final Map<Component,Coding> standardWriterFlags ) { this.standardWriterFlags = standardWriterFlags; return this; } /** Sets the writer compression flags for payload-based indices (default: {@link CompressionFlags#DEFAULT_PAYLOAD_INDEX}). * * @param payloadWriterFlags the flags for payload-based indices. * @return this index builder. */ public IndexBuilder payloadWriterFlags( final Map<Component,Coding> payloadWriterFlags ) { this.payloadWriterFlags = payloadWriterFlags; return this; } /** Sets the skip flag (default: false). If true, the index will be a {@link BitStreamIndex} with skips. The * flag is a no-op unless you require an {@linkplain #interleaved(boolean) interleaved index}. * * @param skips the new value for the skip flag. * @return this index builder. */ public IndexBuilder skips( final boolean skips ) { this.skips = skips; return this; } /** Sets the interleaved flag (default: false). If true, the index will be an {@linkplain BitStreamIndexWriter interleaved index}. * * @param interleaved the new value for the interleaved flag. * @return this index builder. */ public IndexBuilder interleaved( final boolean interleaved ) { this.interleaved = interleaved; return this; } /** Sets the skip quantum (default: {@link BitStreamIndex#DEFAULT_QUANTUM}). * * @param quantum the skip quantum. * @return this index builder. */ public IndexBuilder quantum( final int quantum ) { this.quantum = quantum; return this; } /** Sets the skip height (default: {@link BitStreamIndex#DEFAULT_HEIGHT}). * * @param height the skip height. * @return this index builder. */ public IndexBuilder height( final int height ) { this.height = height; return this; } /** Sets the name of a file containing a map on the document indices (default: <code>null</code>). * * <p>The provided file must containing integers in {@link DataOutput} format. They must by as * many as the number of documents in the collection provided at construction time, and the * resulting function must be injective (i.e., there must be no duplicates). * * @param mapFile a file representing a document map (or <code>null</code> for no mapping). * @return this index builder. */ public IndexBuilder mapFile( final String mapFile ) { this.mapFile = mapFile; return this; } /** Sets the logging time interval (default: {@link ProgressLogger#DEFAULT_LOG_INTERVAL}). * * @param logInterval the logging time interval. * @return this index builder. */ public IndexBuilder logInterval( final long logInterval ) { this.logInterval = logInterval; return this; } /** Sets the temporary directory for batches (default: the directory containing the basename). * * @param batchDirName the name of the temporary directory for batches, or <code>null</code> for the directory containing the basename. * @return this index builder. */ public IndexBuilder batchDirName( final String batchDirName ) { this.batchDirName = batchDirName; return this; } /** Sets the class used to build the index term map (default: {@link ImmutableExternalPrefixMap}). * * <p>The only requirement for <code>termMapClass</code> (besides, of course, implementing {@link StringMap}) * is that of having a public constructor accepting a single parameter of type <samp>{@link Iterable}<{@link CharSequence}></samp>. * * @param termMapClass the class used to build the index term map. * @return this index builder. */ public IndexBuilder termMapClass( final Class<? extends StringMap<? extends CharSequence>> termMapClass ) { this.termMapClass = termMapClass; try { termMapClass.getConstructor( Iterable.class ); } catch ( Exception e ) { throw new IllegalArgumentException( "Class " + termMapClass + " have no constructor accepting an Iterable" ); } return this; } /** Builds the index. * * <p>This method simply invokes {@link Scan} and {@link Combine} using the internally stored settings, and * finally builds a {@link StringMap}. * * <p>If the provided document sequence can be iterated over several times, this method can be called several * times, too, rebuilding each time the index. */ public void run() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { final DocumentFactory factory = documentSequence.factory(); if ( indexedFields.isEmpty() ) { // We index everything for( int i = 0; i < factory.numberOfFields(); i++ ) if ( factory.fieldType( i ) != FieldType.VIRTUAL || virtualDocumentResolvers.containsKey( i ) ) indexedFields.add( i ); } final int[] indexedField = indexedFields.toIntArray(); final String[] basenameField = new String[ indexedField.length ]; for( int i = 0; i < indexedField.length; i++ ) basenameField[ i ] = basename + "-" + factory.fieldName( indexedField[ i ] ); LOGGER.info( "Creating indices " + Arrays.toString( basenameField ) + "..." ); // Create gap array final int[] virtualDocumentGap = new int[ indexedField.length ]; for( int i = 0; i < indexedField.length; i++ ) virtualDocumentGap[ i ] = virtualDocumentGaps.get( i ); // Create virtual document resolver array final VirtualDocumentResolver[] virtualDocumentResolver = new VirtualDocumentResolver[ indexedField.length ]; for( int i: virtualDocumentResolvers.keySet() ) virtualDocumentResolver[ i ] = virtualDocumentResolvers.get( i ); Scan.run( basename, documentSequence, termProcessor, zipCollectionBasename, scanBufferSize, documentsPerBatch, indexedField, virtualDocumentResolver, virtualDocumentGap, mapFile, logInterval, batchDirName); if ( virtualDocumentResolver != null ) ObjectArrays.fill( virtualDocumentResolver, null ); // Let's keep the garbage collector happy final File batchDir = batchDirName == null ? null : new File( batchDirName ); for ( int i = 0; i < indexedField.length; i++ ) { final int batches; if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.VIRTUAL ) { batches = new Properties( basenameField[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION ).getInt( Index.PropertyKeys.BATCHES ); final String[] inputBasename = new String[ batches ]; for( int j = 0; j < inputBasename.length; j++ ) inputBasename[ j ] = Scan.batchBasename( j, basenameField[ i ], batchDir ); new Paste( basenameField[ i ], inputBasename, false, combineBufferSize, batchDir, pasteBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run(); } else { final String[] inputBasename = new Properties( basenameField[ i ] + Scan.CLUSTER_PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX ); batches = inputBasename.length; if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.TEXT ) { if ( mapFile != null ) new Merge( basenameField[ i ], inputBasename, false, combineBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run(); else new Concatenate( basenameField[ i ], inputBasename, false, combineBufferSize, standardWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run(); } else { if ( mapFile != null ) new Merge( basenameField[ i ], inputBasename, false, combineBufferSize, payloadWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run(); else new Concatenate( basenameField[ i ], inputBasename, false, combineBufferSize, payloadWriterFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval ).run(); } } // TODO: this is a bit dirty, because in the else above we actually use the batch names found in the cluster property files. if ( ! keepBatches ) Scan.cleanup( basenameField[ i ], batches, batchDir ); } LOGGER.info( "Creating term maps (class: " + termMapClass.getSimpleName() + ")..." ); for( int i = 0; i < indexedField.length; i++ ) BinIO.storeObject( StringMaps.synchronize( termMapClass.getConstructor( Iterable.class ).newInstance( new FileLinesCollection( basenameField[ i ] + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ) ) ), basenameField[ i ] + DiskBasedIndex.TERMMAP_EXTENSION ); LOGGER.info( "Indexing completed." ); } @SuppressWarnings("unchecked") public static void main( final String[] arg ) throws JSAPException, InvocationTargetException, NoSuchMethodException, IllegalAccessException, ConfigurationException, ClassNotFoundException, IOException, InstantiationException, URISyntaxException { SimpleJSAP jsap = new SimpleJSAP( IndexBuilder.class.getName(), "Builds an index (creates batches, combines them, and builds a term map).", new Parameter[] { new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ), new FlaggedOption( "objectSequence", new ObjectParser( DocumentSequence.class, MG4JClassParser.PACKAGE ), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "object-sequence", "An object specification describing a document sequence that will be used instead of stdin." ), new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ), new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ), new FlaggedOption( "termProcessor", JSAP.STRING_PARSER, NullTermProcessor.class.getName(), JSAP.NOT_REQUIRED, 't', "term-processor", "Sets the term processor to the given class." ), new FlaggedOption( "termMap", MG4JClassParser.getParser(), ImmutableExternalPrefixMap.class.getName(), JSAP.NOT_REQUIRED, 'm', "term-map", "Sets the term map class." ), new Switch( "downcase", JSAP.NO_SHORTFLAG, "downcase", "A shortcut for setting the term processor to the downcasing processor." ), new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'I', "indexed-field", "The field(s) of the document factory that will be indexed. (default: all fields)" ).setAllowMultipleDeclarations( true ), new Switch( "allFields", 'a', "all-fields", "Index also all virtual fields; has no effect if indexedField has been used at least once." ), new FlaggedOption( "batchSize", JSAP.INTSIZE_PARSER, Integer.toString( Scan.DEFAULT_BATCH_SIZE ), JSAP.NOT_REQUIRED, 's', "batch-size", "The size of a batch, in documents. (default: " + Scan.DEFAULT_BATCH_SIZE + ")" ), new Switch( "keepBatches", JSAP.NO_SHORTFLAG, "keep-batches", "Do not delete intermediate batch files." ), new FlaggedOption( "virtualDocumentResolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'v', "virtual-document-resolver", "The virtual document resolver. It can be specified several times in the form [<field>:]<filename>. If the field is omitted, it sets the document resolver for all virtual fields." ).setAllowMultipleDeclarations( true ), new FlaggedOption( "virtualDocumentGap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "virtual-document-gap", "The virtual document gap. It can be specified several times in the form [<field>:]<gap>. If the field is omitted, it sets the document gap for all virtual fields; the default gap is " + Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP ).setAllowMultipleDeclarations( true ), new FlaggedOption( "scanBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Scan.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "scan-buffer-size", "The size of an I/O buffer for the scanning phase." ), new FlaggedOption( "combineBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Combine.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'B', "combine-buffer-size", "The size of an I/O buffer for the combination phase." ), new FlaggedOption( "pasteBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Paste.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "paste-buffer-size", "The size of the internal temporary buffer used while pasting indices." ), new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ), new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( Scan.DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter." ), new FlaggedOption( "renumber", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', "renumber", "The filename of a document renumbering." ), new FlaggedOption( "zipCollection", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'z', "zip", "Creates a support ZipDocumentCollection with given basename." ), new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for textual indices (may be specified several times)." ).setAllowMultipleDeclarations( true ), new FlaggedOption( "payloadComp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'C', "comp-payload", "A compression flag for payload indices (may be specified several times)." ).setAllowMultipleDeclarations( true ), new Switch( "skips", JSAP.NO_SHORTFLAG, "skips", "Requires skips (which however are present by default, unless you required an interleaved index)." ), new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ), new FlaggedOption( "quantum", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_QUANTUM ), JSAP.NOT_REQUIRED, 'Q', "quantum", "Enable skips with given quantum." ), new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "Enable skips with given height." ), new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ), new FlaggedOption( "tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for all temporary batch files." ), new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" ); final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER ); final DocumentFactory factory = documentSequence.factory(); final int[] indexedField = Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" ) ); final VirtualDocumentResolver[] virtualDocumentResolver = Scan.parseVirtualDocumentResolver( jsapResult.getStringArray( "virtualDocumentResolver" ), indexedField, factory ); final int[] virtualDocumentGap = Scan.parseVirtualDocumentGap( jsapResult.getStringArray( "virtualDocumentGap" ), indexedField, factory ); final TermProcessor termProcessor = jsapResult.getBoolean( "downcase" ) ? DowncaseTermProcessor.getInstance() : ObjectParser.fromSpec( jsapResult.getString( "termProcessor" ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } ); final boolean skips = jsapResult.getBoolean( "skips" ); final boolean interleaved = jsapResult.getBoolean( "interleaved" ); if ( interleaved && ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) { System.err.println( "You specified quantum or height, but did not turn on skips." ); return; } IndexBuilder indexBuilder = new IndexBuilder( jsapResult.getString( "basename" ), documentSequence ) .termProcessor( termProcessor ) .zipCollectionBasename( jsapResult.getString( "zipCollection" ) ) .scanBufferSize( jsapResult.getInt( "scanBufferSize" ) ) .skipBufferSize( jsapResult.getInt( "skipBufferSize" ) ) .pasteBufferSize( jsapResult.getInt( "pasteBufferSize" ) ) .combineBufferSize( jsapResult.getInt( "combineBufferSize" ) ) .documentsPerBatch( jsapResult.getInt( "batchSize" ) ) .keepBatches( jsapResult.getBoolean( "keepBatches" ) ) .termMapClass( jsapResult.getClass( "termMap" ) ) .indexedFields( indexedField ) .skips( skips ) .interleaved( interleaved ) .quantum( jsapResult.getInt( "quantum" ) ) .height( jsapResult.getInt( "height" ) ) .logInterval( jsapResult.getLong( "logInterval" ) ) .batchDirName( jsapResult.getString( "tempDir" ) ); for( int i = 0; i < virtualDocumentResolver.length; i++ ) if ( virtualDocumentResolver[ i ] != null ) indexBuilder.virtualDocumentResolvers.put( i, virtualDocumentResolver[ i ] ); for( int i = 0; i < virtualDocumentGap.length; i++ ) indexBuilder.virtualDocumentGaps.put( i, virtualDocumentGap[ i ] ); if ( jsapResult.userSpecified( "comp" ) ) indexBuilder.standardWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ) ); if ( jsapResult.userSpecified( "compPayload" ) ) indexBuilder.payloadWriterFlags( CompressionFlags.valueOf( jsapResult.getStringArray( "compPayload" ), CompressionFlags.DEFAULT_PAYLOAD_INDEX ) ); if ( jsapResult.userSpecified( "renumber" ) ) indexBuilder.mapFile( jsapResult.getString( "renumber" ) ); indexBuilder.run(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -