📄 indexbuilder.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.Int2IntArrayMap;import it.unimi.dsi.fastutil.ints.Int2IntMap;import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;import it.unimi.dsi.fastutil.ints.Int2ObjectMap;import it.unimi.dsi.fastutil.ints.IntRBTreeSet;import it.unimi.dsi.fastutil.ints.IntSortedSet;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.Object2IntMap;import it.unimi.dsi.fastutil.objects.ObjectArrays;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;import it.unimi.dsi.mg4j.index.BitStreamHPIndex;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.NullTermProcessor;import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;import it.unimi.dsi.mg4j.index.TermProcessor;import 
it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.Util;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.lang.ObjectParser;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.ImmutableExternalPrefixMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.ShiftAddXorSignedStringMap;import it.unimi.dsi.util.StringMap;import it.unimi.dsi.util.StringMaps;import java.io.DataOutput;import java.io.File;import java.io.IOException;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** An index builder. * * <p>An instance of this class exposes a {@link #run()} method * that will index the {@link DocumentSequence} provided at construction time * by calling {@link Scan} and {@link Combine} in sequence. * * <p>Additionally, a main method provides easy access to index construction. * * <p>All indexing parameters are available * either as chainable setters that can be called optionally before invoking {@link #run()}, or * as public mutable collections and maps. For instance, * <pre> * new IndexBuilder( "foo", sequence ).skips( true ).run(); * </pre> * will build an index with basename <samp>foo</samp> using skips. 
If instead we want to
 * index just the first field of the sequence, and use a {@link ShiftAddXorSignedStringMap}
 * as a term map, we can use the following code:
 * <pre>
 * new IndexBuilder( "foo", sequence )
 *     .termMapClass( ShiftAddXorSignedStringMap.class )
 *     .indexedFields( 0 ).run();
 * </pre>
 * <p>More sophisticated modifications can be applied using public maps:
 * <pre>
 * IndexBuilder indexBuilder = new IndexBuilder( "foo", sequence );
 * indexBuilder.virtualDocumentGaps.put( 0, 30 );
 * indexBuilder.virtualDocumentResolvers.put( 0, someVirtualDocumentResolver );
 * indexBuilder.run();
 * </pre>
 *
 */
public class IndexBuilder {
    /** The logger shared by all instances of this class. */
    final static Logger LOGGER = Util.getLogger( IndexBuilder.class );

    /** The basename from which all files will be stemmed. */
    private final String basename;
    /** The document sequence to be indexed. */
    private final DocumentSequence documentSequence;
    /** The term processor (default: {@link DowncaseTermProcessor}). */
    private TermProcessor termProcessor = DowncaseTermProcessor.getInstance();
    /** The number of documents {@link Scan} will attempt to add to each batch. */
    private int documentsPerBatch = Scan.DEFAULT_BATCH_SIZE;
    /** The "keep batches" flag (default: false). */
    private boolean keepBatches;
    /** The basename of a zipped collection created during indexing, or <code>null</code> for no collection creation. */
    private String zipCollectionBasename;
    private Map<Component, Coding> standardWriterFlags = CompressionFlags.DEFAULT_STANDARD_INDEX;
    private Map<Component, Coding> payloadWriterFlags = CompressionFlags.DEFAULT_PAYLOAD_INDEX;
    private boolean skips;
    private boolean interleaved;
    private int quantum = BitStreamIndex.DEFAULT_QUANTUM;
    private int height =BitStreamIndex.DEFAULT_HEIGHT;
    /** The buffer size for {@link Scan}. */
    private int scanBufferSize = Scan.DEFAULT_BUFFER_SIZE;
    /** The buffer size for {@link Combine}. */
    private int combineBufferSize = Combine.DEFAULT_BUFFER_SIZE;
    /** The size of the internal buffer used by {@link SkipBitStreamIndexWriter}. */
    private int skipBufferSize = SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE;
    /** The size of the internal buffer used by {@link Paste}. */
    private int pasteBufferSize = Paste.DEFAULT_MEMORY_BUFFER_SIZE;
    private String batchDirName;

    /** The set of indexed fields (expressed as field indices). If left empty, <em>all</em> fields will be indexed,
     * with the proviso that fields of type {@link FieldType#VIRTUAL} will be indexed only
     * if they have a corresponding {@link VirtualDocumentResolver}.
*
     * <p>An alternative, chained access to this set is provided by the method {@link #indexedFields(int[])}.
     *
     * <p>After calling {@link #run()}, this set will contain the fields actually indexed.
     */
    public IntSortedSet indexedFields = new IntRBTreeSet();

    /** A map from field indices to a corresponding {@link VirtualDocumentResolver}. */
    public Int2ObjectMap<VirtualDocumentResolver> virtualDocumentResolvers = new Int2ObjectArrayMap<VirtualDocumentResolver>();

    /** The class of the term map (default: {@link ImmutableExternalPrefixMap}). */
    private Class<? extends StringMap<? extends CharSequence>> termMapClass = ImmutableExternalPrefixMap.class;

    /** A map from field indices to virtual gaps. Only values associated to fields of type {@link FieldType#VIRTUAL} are meaningful,
     * and the {@linkplain Object2IntMap#defaultReturnValue() default return value} is set to {@link Scan#DEFAULT_VIRTUAL_DOCUMENT_GAP}. You
     * can either add entries, or change the default return value.
     */
    public Int2IntMap virtualDocumentGaps = new Int2IntArrayMap();
    // Instance initializer: executed on every construction, after the field initializer
    // just above, so the map already exists when its default return value is set.
    { virtualDocumentGaps.defaultReturnValue( Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP ); }

    private String mapFile;
    /** The logging interval (default: {@link ProgressLogger#DEFAULT_LOG_INTERVAL}). */
    private long logInterval = ProgressLogger.DEFAULT_LOG_INTERVAL;

    /** Creates a new index builder with default parameters.
     *
     * <p>Note, in particular, that the resulting index will be a {@link BitStreamHPIndex}
     * (unless you require payloads, in which case it will be a {@link BitStreamIndex} with skips),
     * and that all terms will be {@linkplain DowncaseTermProcessor downcased}. You can set
     * more finely the type of index using {@link #interleaved(boolean)} and {@link #skips(boolean)}.
     *
     * @param basename the basename from which all files will be stemmed.
     * @param documentSequence the document sequence to be indexed.
     */
    public IndexBuilder( String basename, final DocumentSequence documentSequence ) {
        this.basename = basename;
        this.documentSequence = documentSequence;
    }

    /** Sets the term processor (default: {@link DowncaseTermProcessor}).
     *
     * @param termProcessor the term processor.
* @return this index builder. */ public IndexBuilder termProcessor( final TermProcessor termProcessor ) { this.termProcessor = termProcessor; return this; } /** Sets the indexed fields to those provided (default: all fields, but see {@link #indexedFields}). * * <p>This is a utility method that provides a way to set {@link #indexedFields} in a chainable way. * * @param field a list of fields to be indexed, that will <em>replace</em> the current values in {@link #indexedFields}. * @return this index builder. * @see IndexBuilder#indexedFields */ public IndexBuilder indexedFields( int... field ) { indexedFields.clear(); for( int f: field ) indexedFields.add( f ); return this; } /** Adds a virtual document resolver to {@link #virtualDocumentResolvers}. * * <p>This is a utility method that provides a way to put an element into {@link #virtualDocumentResolvers} in a chainable way. * * @param field a field index. * @param virtualDocumentResolver a virtual document resolver. * @return this index builder. * @see IndexBuilder#virtualDocumentResolvers */ public IndexBuilder virtualDocumentResolver( final int field, final VirtualDocumentResolver virtualDocumentResolver ) { virtualDocumentResolvers.put( field, virtualDocumentResolver ); return this; } /** Sets the {@link Scan} buffer size (default: {@link Scan#DEFAULT_BUFFER_SIZE}). * * @param bufferSize a buffer size for {@link Scan}. * @return this index builder. */ public IndexBuilder scanBufferSize( final int bufferSize ) { this.scanBufferSize = bufferSize; return this; } /** Sets the {@link Combine} buffer size (default: {@link Combine#DEFAULT_BUFFER_SIZE}). * * @param bufferSize a buffer size for {@link Combine}. * @return this index builder. */ public IndexBuilder combineBufferSize( final int bufferSize ) { this.combineBufferSize = bufferSize; return this; } /** Sets both the {@linkplain #scanBufferSize(int) scan buffer size} and the {@linkplain #combineBufferSize(int) combine buffer size}. 
* * @param bufferSize a buffer size. * @return this index builder. */ public IndexBuilder bufferSize( final int bufferSize ) { scanBufferSize( bufferSize ); combineBufferSize( bufferSize ); return this; } /** Sets the size in byte of the internal buffer using during the construction of a index with skips (default: {@link SkipBitStreamIndexWriter#DEFAULT_TEMP_BUFFER_SIZE}). * * @param bufferSize a buffer size for {@link SkipBitStreamIndexWriter}. * @return this index builder. */ public IndexBuilder skipBufferSize( final int bufferSize ) { this.skipBufferSize = bufferSize; return this; } /** Sets the size in byte of the internal buffer using when {@linkplain Paste pasting indices} (default: {@link Paste#DEFAULT_MEMORY_BUFFER_SIZE}). * * @param bufferSize a buffer size for {@link Paste}. * @return this index builder. */ public IndexBuilder pasteBufferSize( final int bufferSize ) { this.pasteBufferSize = bufferSize; return this; } /** Sets the name of a zipped collection that will be created during the indexing process (default: <code>null</code>). * * @param zipCollectionBasename the basename of a zipped collection, or <code>null</code> for no collection creation. * @return this index builder. */ public IndexBuilder zipCollectionBasename( final String zipCollectionBasename ) { this.zipCollectionBasename = zipCollectionBasename; return this; } /** Sets the number of documents per batch (default: {@link Scan#DEFAULT_BATCH_SIZE}). * * @param documentsPerBatch the number of documents {@link Scan} will attempt to add to each batch. * @return this index builder. */ public IndexBuilder documentsPerBatch( final int documentsPerBatch ) { this.documentsPerBatch = documentsPerBatch; return this; } /** Sets the “keep batches” flag (default: false). If true, the temporary batch files generated
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -