📄 index.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.index;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2004-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntList;import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;import it.unimi.dsi.fastutil.objects.ReferenceSet;import it.unimi.dsi.fastutil.objects.ReferenceSets;import it.unimi.dsi.lang.ObjectParser;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.index.remote.IndexServer;import it.unimi.dsi.mg4j.search.DocumentIterator;import it.unimi.dsi.mg4j.search.IntervalIterator;import it.unimi.dsi.mg4j.search.visitor.DocumentIteratorVisitor;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.Util;import it.unimi.dsi.util.ImmutableExternalPrefixMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.StringMap;import it.unimi.dsi.util.PrefixMap;import it.unimi.dsi.util.StringMaps;import java.io.IOException;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.net.URI;import 
java.net.URISyntaxException;import java.util.EnumMap;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;
/** An abstract representation of an index.
 *
 * <P>Concrete subclasses of this class represent abstract index access
 * information: for instance, the basename or IP address/port,
 * flags, etc. It makes it easy to build {@linkplain IndexReader index readers} over the index:
 * in turn, index readers provide {@linkplain it.unimi.dsi.mg4j.search.DocumentIterator document iterators}.
 * 
 * <P>In principle, this class should just contain method declarations,
 * and attributes for all data that is common to any form of index.
 * Note that we use an abstract class, rather than an interface, because
 * interfaces do not allow attributes to be declared.
 * 
 * <P>This class provides static factory methods (e.g., {@link #getInstance(CharSequence)})
 * that return an index given a suitable URI string. If the scheme part is <samp>mg4j</samp>, then
 * the URI is assumed to point at a remote index. Otherwise, it is assumed to be the
 * basename of a local index. In both cases, a query part introduced by <samp>?</samp> can
 * specify additional parameters (<samp><var>key</var>=<var>value</var></samp> pairs separated
 * by <samp>;</samp>). For instance, the URI <samp>example?inmemory=1</samp> will load
 * the index with basename <samp>example</samp>, caching its content in core memory.
 * Please have a look at constants in {@link Index.UriKeys}
 * (and analogous enums in subclasses) for additional parameters.
 *
 * <h2>Thread safety</h2>
 * 
 * <p>Indices are a natural candidate for multithreaded access. An instance of this class
 * <strong>must</strong> be thread safe as long as external data structures provided to its
 * constructors are. 
For instance, the tool {@link it.unimi.dsi.mg4j.tool.IndexBuilder} generates
 * a {@linkplain StringMaps#synchronize(PrefixMap) synchronized} {@link ImmutableExternalPrefixMap}
 * so that by default the resulting index is thread safe.
 * 
 * <p>For instance, a {@link it.unimi.dsi.mg4j.index.DiskBasedIndex} requires a list of
 * term offsets, term maps, etc. As long as all these data structures are thread safe, the
 * same is true of the index. Data structures created by static factory methods such as
 * {@link it.unimi.dsi.mg4j.index.DiskBasedIndex#getInstance(CharSequence)} are thread safe.
 * 
 * <p>Note that {@link it.unimi.dsi.mg4j.index.IndexReader}s returned by {@link #getReader()}
 * are <em>not</em> thread safe (even if the method {@link #getReader()} is). The logic behind
 * this arrangement is that you create as many readers as you need, and then {@link java.io.Closeable#close()} them. In a multithreaded
 * environment, a pool of index readers can be created, and a custom {@link it.unimi.dsi.mg4j.query.nodes.QueryBuilderVisitor}
 * can be used to build {@link it.unimi.dsi.mg4j.search.DocumentIterator}s using the given pool of readers. In
 * this case readers are not closed, but rather reused.
 * 
 * <h2>Read-once load</h2>
 * 
 * <p>Implementations of this class are strongly encouraged to offer <em>read-once</em> constructors
 * and factory methods: property files and other data related to the index (but not to an {@link it.unimi.dsi.mg4j.index.IndexReader})
 * should be read exactly once, and sequentially. This feature is very useful when
 * {@linkplain it.unimi.dsi.mg4j.tool.Combine combining indices}.
 * 
 * @author Paolo Boldi
 * @author Sebastiano Vigna
 * @since 0.9
 */public abstract class Index implements Serializable {	private static final Logger LOGGER = Util.getLogger( Index.class );	private static final long serialVersionUID = 0;	/** Symbolic names for properties of a {@link it.unimi.dsi.mg4j.index.Index}. 
*/
	public static enum PropertyKeys {
		/** The number of documents in the collection. */
		DOCUMENTS,
		/** The number of terms in the collection. */
		TERMS,
		/** The number of occurrences in the collection. */
		OCCURRENCES,
		/** The number of postings (pairs term/document) in the collection. */
		POSTINGS,
		/** The number of batches this index was (or should be) built from. */
		BATCHES,
		/** The maximum count. */
		MAXCOUNT,
		/** The maximum size (in words) of a document. */
		MAXDOCSIZE,
		/** The specification of the {@link TermProcessor} of this index, as parsed by
		 * {@link Index#getTermProcessor(Properties)}. */
		TERMPROCESSOR,
		/** A class for the payloads of this index. */
		PAYLOADCLASS,
		/** The specification of a compression flag. This property can be specified
		 * as many times as necessary (e.g., <samp>FREQUENCIES:GAMMA</samp>, <samp>POINTERS:GOLOMB</samp>, etc.). */
		CODING,		
		/** The name of the {@link Index} class. */
		INDEXCLASS,			
		/** The name of the field indexed by this index, if any. */
		FIELD,			
		/** The size in bits of the index. */
		SIZE
	}
	
	/** Keys to be used (downcased) in specifying additional parameters to an MG4J URI. */
	
	public static enum UriKeys {
		/** When set, forces loading a local index into core memory. */
		INMEMORY,
		/** When set, forces a local index to be mapped into core memory. */
		MAPPED,
		/** The step used for creating the offset {@link it.unimi.dsi.mg4j.util.SemiExternalOffsetList}. If
		 * set to zero, the offset list will be entirely loaded into core memory. If negative, the list
		 * will be memory-mapped, and the absolute value will be used as step. */
		OFFSETSTEP,
		/** The name of a sizes file that will be loaded in case of an {@link IndexCluster}. */
		SIZES,
	}

	/** The field indexed by this index, or <code>null</code>. */
	public final String field;
	/** The properties of this index. It is stored here for convenience (for instance,
	 * if custom keys are added to the property file), but it may be <code>null</code>. */
	public final Properties properties;
	/** The number of documents of the collection. */
	public final int numberOfDocuments;
	/** The number of terms of the collection. This field might be set to -1 in some cases 
	 * (for instance, in certain documental clusters). */
	public final int numberOfTerms;
	/** The number of occurrences of the collection. */
	public final long numberOfOccurrences;
	/** The number of postings (pairs term/document) of the collection. */
	public final long numberOfPostings;
	/** The maximum number of positions in a position list, or -1 if it is unknown. */
	public final int maxCount;
	/** The payload for this index, or <code>null</code>. */
	public final Payload payload;
	/** Whether this index contains payloads; if true, {@link #payload} is non-<code>null</code>. */
	public final boolean hasPayloads;
	/** Whether this index contains counts. */
	public final boolean hasCounts;
	/** Whether this index contains positions. */
	public final boolean hasPositions;
	/** The term processor used to build this index. */
	public final TermProcessor termProcessor;
	/** An immutable singleton set containing just {@link #keyIndex}. */
	public ReferenceSet<Index> singletonSet;
	/** The index used as a key to retrieve intervals. Usually equal to <code>this</code>, but it is {@linkplain #keyIndex(Index) settable}. */
	public Index keyIndex;
	/** The size of each document, or <code>null</code> if sizes are not necessary or not loaded in this index. */
	public final IntList sizes;
	/** Creates a new instance, initialising all fields. 
*/	protected Index( final int numberOfDocuments, final int numberOfTerms, final long numberOfPostings,			final long numberOfOccurrences, final int maxCount,			final Payload payload, final boolean hasCounts, final boolean hasPositions, final TermProcessor termProcessor,			final String field, final IntList sizes, final Properties properties ) {		this.numberOfDocuments = numberOfDocuments;		this.numberOfTerms = numberOfTerms;		this.numberOfPostings = numberOfPostings;		this.numberOfOccurrences = numberOfOccurrences;		this.maxCount = maxCount;		this.payload = payload;		this.hasPayloads = payload != null;		this.hasCounts = hasCounts;		this.hasPositions = hasPositions;		this.termProcessor = termProcessor;		this.field = field;		this.properties = properties;		this.keyIndex = this;		this.singletonSet = ReferenceSets.singleton( this );		this.sizes = sizes;	}	protected static TermProcessor getTermProcessor( final Properties properties ) {		try {			// Catch old property files			if ( properties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) == null ) 				throw new IllegalArgumentException( "No term processor has been specified (most likely, because of an obsolete property file)" );			return ObjectParser.fromSpec( properties.getString( Index.PropertyKeys.TERMPROCESSOR ), TermProcessor.class, MG4JClassParser.PACKAGE, new String[] { "getInstance" } );		}		catch ( Exception e ) {			throw new RuntimeException( e );		}	}	/** Returns a new index using the given URI.	 * 	 * <p>If <code>uri</code> has scheme <samp>mg4j</samp>, the index is considered to be remote	 * and index creation delegated to {@link IndexServer#getIndex(String, int, boolean, boolean)}. Otherwise,	 * we delegate to {@link DiskBasedIndex#getInstance(CharSequence, boolean, boolean, boolean, EnumMap)}.	 * 	 * @param uri the URI defining the index.	 * @param randomAccess whether the index should be accessible randomly.	 
* @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes	 * might be loaded anyway because the compression method for positions requires it).	 * @param maps if true, {@linkplain StringMap term} and {@linkplain PrefixMap prefix} maps will be guessed and loaded (this	 * feature might not be available with some kind of index). 	 */
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -