📄 indexbuilder.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.tool;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.Int2IntArrayMap;import it.unimi.dsi.fastutil.ints.Int2IntMap;import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;import it.unimi.dsi.fastutil.ints.Int2ObjectMap;import it.unimi.dsi.fastutil.ints.IntRBTreeSet;import it.unimi.dsi.fastutil.ints.IntSortedSet;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.Object2IntMap;import it.unimi.dsi.fastutil.objects.ObjectArrays;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;import it.unimi.dsi.mg4j.index.BitStreamHPIndex;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.NullTermProcessor;import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;import it.unimi.dsi.mg4j.index.TermProcessor;import 
it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.Util;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.lang.ObjectParser;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.ImmutableExternalPrefixMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.ShiftAddXorSignedStringMap;import it.unimi.dsi.util.StringMap;import it.unimi.dsi.util.StringMaps;import java.io.DataOutput;import java.io.File;import java.io.IOException;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** An index builder.  *  * <p>An instance of this class exposes a {@link #run()} method * that will index the {@link DocumentSequence} provided at construction time * by calling {@link Scan} and {@link Combine} in sequence. *  * <p>Additionally, a main method provides easy access to index construction. *  * <p>All indexing parameters are available * either as chainable setters that can be called optionally before invoking {@link #run()}, or * as public mutable collections and maps. For instance, * <pre> * new IndexBuilder( "foo", sequence ).skips( true ).run(); * </pre> * will build an index with basename <samp>foo</samp> using skips. 
If instead we want to  * index just the first field of the sequence, and use a {@link ShiftAddXorSignedStringMap} * as a term map, we can use the following code: * <pre> * new IndexBuilder( "foo", sequence ) *     .termMapClass( ShiftAddXorSignedStringMap.class ) *     .indexedFields( 0 ).run(); * </pre> * <p>More sophisticated modifications can be applied using public maps: * <pre> * IndexBuilder indexBuilder = new IndexBuilder( "foo", sequence ); * indexBuilder.virtualDocumentGaps.put( 0, 30 ); * indexBuilder.virtualDocumentResolvers.put( 0, someVirtualDocumentResolver ); * indexBuilder.run(); * </pre> * */public class IndexBuilder {	final static Logger LOGGER = Util.getLogger( IndexBuilder.class );	private final String basename;	private final DocumentSequence documentSequence;	private TermProcessor termProcessor = DowncaseTermProcessor.getInstance();	private int documentsPerBatch = Scan.DEFAULT_BATCH_SIZE;	private boolean keepBatches;	private String zipCollectionBasename;	private Map<Component, Coding> standardWriterFlags = CompressionFlags.DEFAULT_STANDARD_INDEX;	private Map<Component, Coding> payloadWriterFlags = CompressionFlags.DEFAULT_PAYLOAD_INDEX;		private boolean skips;	private boolean interleaved;	private int quantum = BitStreamIndex.DEFAULT_QUANTUM;	private int height =BitStreamIndex.DEFAULT_HEIGHT;		private int scanBufferSize = Scan.DEFAULT_BUFFER_SIZE;	private int combineBufferSize = Combine.DEFAULT_BUFFER_SIZE;	private int skipBufferSize = SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE;	private int pasteBufferSize = Paste.DEFAULT_MEMORY_BUFFER_SIZE;	private String batchDirName;	/** The set of indexed fields (expressed as field indices). If left empty, <em>all</em> fields will be indexed,	 * with the proviso that fields of type {@link FieldType#VIRTUAL} will be indexed only	 * if they have a corresponding {@link VirtualDocumentResolver}.	 
* 
 * <p>An alternative, chained access to this map is provided by the method {@link #indexedFields(int[])}
 * 
 * <p>After calling {@link #run()}, this map will contain the set of fields actually indexed.
 */
	public IntSortedSet indexedFields = new IntRBTreeSet();

	/** A map from field indices to a corresponding {@link VirtualDocumentResolver}.
	 *
	 * <p>Entries can also be added in a chainable way using
	 * {@link #virtualDocumentResolver(int, VirtualDocumentResolver)}.
	 */
	public Int2ObjectMap<VirtualDocumentResolver> virtualDocumentResolvers = new Int2ObjectArrayMap<VirtualDocumentResolver>();

	/** The term-map class (default: {@link ImmutableExternalPrefixMap}). */
	private Class<? extends StringMap<? extends CharSequence>> termMapClass = ImmutableExternalPrefixMap.class;

	/** A map from field indices to virtual gaps. Only values associated to fields of type {@link FieldType#VIRTUAL} are meaningful,
	 * and the {@linkplain Int2IntMap#defaultReturnValue() default return value} is set to {@link Scan#DEFAULT_VIRTUAL_DOCUMENT_GAP}. You
	 * can either add entries, or change the default return value.
	 */
	public Int2IntMap virtualDocumentGaps = new Int2IntArrayMap();
	{
		// Instance initializer: fields without an explicit entry get the default virtual-document gap.
		virtualDocumentGaps.defaultReturnValue( Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP );
	}

	private String mapFile;

	/** The logging interval (default: {@link ProgressLogger#DEFAULT_LOG_INTERVAL}). */
	private long logInterval = ProgressLogger.DEFAULT_LOG_INTERVAL;

	/** Creates a new index builder with default parameters.
	 * 
	 * <p>Note, in particular, that the resulting index will be a {@linkplain BitStreamHPIndex}
	 * (unless you require payloads, in which case it will be a {@link BitStreamIndex} with skips),
	 * and that all terms will be {@linkplain DowncaseTermProcessor downcased}. You can set
	 * more finely the type of index using {@link #interleaved(boolean)} and {@link #skips(boolean)}.
	 * 
	 * <p>The constructor only stores its arguments; every other parameter keeps the default
	 * given by its field initializer until the corresponding setter is called.
	 * 
	 * @param basename the basename from which all files will be stemmed.
	 * @param documentSequence the document sequence to be indexed.
	 */
	public IndexBuilder( String basename, final DocumentSequence documentSequence ) {
		this.basename = basename;
		this.documentSequence = documentSequence;
	}

	/** Sets the term processor (default: {@link DowncaseTermProcessor}).
* 	 * @param termProcessor the term processor.	 * @return this index builder.	 */	public IndexBuilder termProcessor( final TermProcessor termProcessor ) {		this.termProcessor = termProcessor;		return this;	}	/** Sets the indexed fields to those provided (default: all fields, but see {@link #indexedFields}).	 * 	 * <p>This is a utility method that provides a way to set {@link #indexedFields} in a chainable way.	 * 	 * @param field a list of fields to be indexed, that will <em>replace</em> the current values in {@link #indexedFields}.	 * @return this index builder.	 * @see IndexBuilder#indexedFields	 */	public IndexBuilder indexedFields( int... field ) {		indexedFields.clear();		for( int f: field ) indexedFields.add( f );		return this;	}	/** Adds a virtual document resolver to {@link #virtualDocumentResolvers}.	 * 	 * <p>This is a utility method that provides a way to put an element into {@link #virtualDocumentResolvers} in a chainable way.	 * 	 * @param field a field index.	 * @param virtualDocumentResolver a virtual document resolver.	 * @return this index builder.	 * @see IndexBuilder#virtualDocumentResolvers	 */	public IndexBuilder virtualDocumentResolver( final int field, final VirtualDocumentResolver virtualDocumentResolver ) {		virtualDocumentResolvers.put( field, virtualDocumentResolver );		return this;	}	/** Sets the {@link Scan} buffer size (default: {@link Scan#DEFAULT_BUFFER_SIZE}).	 * 	 * @param bufferSize a buffer size for {@link Scan}.	 * @return this index builder.	 */	public IndexBuilder scanBufferSize( final int bufferSize ) {		this.scanBufferSize = bufferSize;		return this;	}		/** Sets the {@link Combine} buffer size (default: {@link Combine#DEFAULT_BUFFER_SIZE}).	 * 	 * @param bufferSize a buffer size for {@link Combine}.	 * @return this index builder.	 
*/
	public IndexBuilder combineBufferSize( final int bufferSize ) {
		this.combineBufferSize = bufferSize;
		return this;
	}

	/** Sets both the {@linkplain #scanBufferSize(int) scan buffer size} and the {@linkplain #combineBufferSize(int) combine buffer size}.
	 * 
	 * <p>This method simply delegates to {@link #scanBufferSize(int)} and {@link #combineBufferSize(int)}
	 * with the same value.
	 * 
	 * @param bufferSize a buffer size.
	 * @return this index builder.
	 */
	public IndexBuilder bufferSize( final int bufferSize ) {
		scanBufferSize( bufferSize );
		combineBufferSize( bufferSize );
		return this;
	}

	/** Sets the size in bytes of the internal buffer used during the construction of an index with skips (default: {@link SkipBitStreamIndexWriter#DEFAULT_TEMP_BUFFER_SIZE}).
	 * 
	 * @param bufferSize a buffer size for {@link SkipBitStreamIndexWriter}.
	 * @return this index builder.
	 */
	public IndexBuilder skipBufferSize( final int bufferSize ) {
		this.skipBufferSize  = bufferSize;
		return this;
	}

	/** Sets the size in bytes of the internal buffer used when {@linkplain Paste pasting indices} (default: {@link Paste#DEFAULT_MEMORY_BUFFER_SIZE}).
	 * 
	 * @param bufferSize a buffer size for {@link Paste}.
	 * @return this index builder.
	 */
	public IndexBuilder pasteBufferSize( final int bufferSize ) {
		this.pasteBufferSize  = bufferSize;
		return this;
	}

	/** Sets the name of a zipped collection that will be created during the indexing process (default: <code>null</code>).
	 * 
	 * @param zipCollectionBasename the basename of a zipped collection, or <code>null</code> for no collection creation.
	 * @return this index builder.
	 */
	public IndexBuilder zipCollectionBasename( final String zipCollectionBasename ) {
		this.zipCollectionBasename = zipCollectionBasename;
		return this;
	}

	/** Sets the number of documents per batch (default: {@link Scan#DEFAULT_BATCH_SIZE}).
	 * 
	 * @param documentsPerBatch the number of documents {@link Scan} will attempt to add to each batch.
	 * @return this index builder.
	 */
	public IndexBuilder documentsPerBatch( final int documentsPerBatch ) {
		this.documentsPerBatch = documentsPerBatch;
		return this;
	}

	/** Sets the &ldquo;keep batches&rdquo; flag (default: false). If true, the temporary batch files generated
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -