📄 diskbasedindex.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.index;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2004-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.ints.IntList;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.longs.LongArrayList;import it.unimi.dsi.fastutil.longs.LongList;import it.unimi.dsi.fastutil.longs.LongLists;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.index.Index.UriKeys;import it.unimi.dsi.io.ByteBufferInputStream;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.util.Properties;import it.unimi.dsi.mg4j.util.SemiExternalOffsetList;import it.unimi.dsi.util.StringMap;import it.unimi.dsi.util.PrefixMap;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.nio.ByteBuffer;import java.nio.channels.FileChannel.MapMode;import java.util.EnumMap;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;/** A static container providing facilities to load an index based on data 
stored on disk.
 *
 * <P>This class contains several useful static methods
 * such as {@link #readOffsets(InputBitStream, int)} and {@link #readSizes(InputBitStream, int)},
 * and static factory methods such as {@link #getInstance(CharSequence, boolean, boolean, boolean, EnumMap)}
 * that take care of reading the properties associated with the index, identify
 * the correct {@link it.unimi.dsi.mg4j.index.Index} implementation that
 * should be used to load the index, and load the necessary data into memory.
 *
 * <p>As an option, a disk-based index can be <em>loaded</em> into main memory (key: {@link Index.UriKeys#INMEMORY}), returning
 * an {@link it.unimi.dsi.mg4j.index.InMemoryIndex}/{@link InMemoryHPIndex}, or <em>mapped</em> into main memory (key: {@link Index.UriKeys#MAPPED}),
 * returning a {@link MemoryMappedIndex}/{@link InMemoryHPIndex} (note that the value assigned to the keys is irrelevant).
 * In both cases some insurmountable Java problems
 * prevent using indices whose size exceeds two gigabytes (but see {@link MemoryMappedIndex} for
 * some elaboration on this topic).
 *
 * <p>Moreover, by default the
 * term-offset list is accessed using a {@link it.unimi.dsi.mg4j.util.SemiExternalOffsetList}
 * with a step of {@link #DEFAULT_OFFSET_STEP}. This behaviour can be changed using
 * the URI key {@link UriKeys#OFFSETSTEP}.
 *
 * <p>Disk-based indices are the workhorse of MG4J. All other indices (clustered,
 * remote, etc.) ultimately rely on disk-based indices to provide results.
 *
 * <p>Note that not all data produced by {@link it.unimi.dsi.mg4j.tool.Scan} and
 * by the other indexing utilities are actually necessary to run a disk-based
 * index.
Usually the property file and the index file (plus the positions file,  * for {@linkplain BitStreamHPIndex high-performance indices}) are sufficient: if one * needs random access, also the offsets file must be present, and if the * compression method requires document sizes or if sizes are requested explicitly, * also the sizes file must be present. A {@link StringMap} * and possibly a {@link PrefixMap} will be fetched * automatically by {@link #getInstance(CharSequence, boolean, boolean)} * using standard extensions. * * <h2>Thread safety</h2> *  * <p>A disk-based index is thread safe as long as the offset list, the size list and * the term/prefix map are. The static factory methods provided by this class load * offsets and sizes using data structures that are thread safe. If you use directly * a constructor, instead, it is your responsability to pass thread-safe data structures. * * @author Sebastiano Vigna * @since 1.1 */public class DiskBasedIndex {	private static final Logger LOGGER = Util.getLogger( DiskBasedIndex.class );	private static final long serialVersionUID = 0;	/** The default value for the query parameter {@link Index.UriKeys#OFFSETSTEP}. */	public final static int DEFAULT_OFFSET_STEP = 256;	/** Standard extension for the index bitstream. */	public static final String INDEX_EXTENSION = ".index";	/** Standard extension for the positions bitstream of an {@linkplain BitStreamHPIndexWriter high-performance index}. */	public static final String POSITIONS_EXTENSION = ".positions";	/** Standard extension for the index properties. */	public static final String PROPERTIES_EXTENSION = ".properties";	/** Standard extension for the file of sizes. */	public static final String SIZES_EXTENSION = ".sizes";	/** Standard extension for the file of offsets. */	public static final String OFFSETS_EXTENSION = ".offsets";	/** Standard extension for the file of global counts. 
*/	public static final String GLOBCOUNTS_EXTENSION = ".globcounts";	/** Standard extension for the file of frequencies. */	public static final String FREQUENCIES_EXTENSION = ".frequencies";	/** Standard extension for the file of terms. */	public static final String TERMS_EXTENSION = ".terms";	/** Standard extension for the file of terms, unsorted. */	public static final String UNSORTED_TERMS_EXTENSION = ".terms.unsorted";	/** Standard extension for the term map. */	public static final String TERMMAP_EXTENSION = ".termmap";	/** Standard extension for the prefix map. */	public static final String PREFIXMAP_EXTENSION = ".prefixmap";	/** Standard extension for the stats file. */	public static final String STATS_EXTENSION = ".stats";		private DiskBasedIndex() {}		/** Utility method to load a compressed offset file into a list.	 *	 * @param in the input bit stream providing the offsets (see {@link BitStreamIndexWriter}).	 * @param T the number of terms indexed.	 * @return a list of longs backed by an array; the list has	 * an additional final element of index <code>T</code> that gives the number	 * of bytes of the index file.	 */	public static LongList readOffsets( final InputBitStream in, final int T ) throws IOException {		final long[] offset = new long[ T + 1 ];		LOGGER.debug( "Loading offsets..." );		offset[ 0 ] = in.readLongGamma();		for( int i = 0; i < T; i++ ) offset[ i + 1 ] = in.readLongGamma() + offset[ i ];		LOGGER.debug( "Completed." );		return LongArrayList.wrap( offset );	}	/** Utility method to load a compressed size file into a list.	 *	 * @param in the input bit stream providing the offsets (see {@link BitStreamIndexWriter}).	 * @param N the number of documents indexed.	 * @return a list of integers backed by an array.	 */	public static IntList readSizes( final InputBitStream in, final int N ) throws IOException {		final int[] size = new int[ N ];		LOGGER.debug( "Loading sizes..." 
);		for( int i = 0; i < N; i++ ) size[ i ] = in.readGamma();		  		LOGGER.debug( "Completed." );		return IntArrayList.wrap( size );	}	/** Utility static method that loads a term map.	 * 	 * @param filename the name of the file containing the term map.	 * @return the map, or <code>null</code> if the file did not exist.	 * @throws IOException if some IOException (other than {@link FileNotFoundException}) occurred.	 */	@SuppressWarnings("unchecked")	public static StringMap<? extends CharSequence> loadStringMap( final String filename ) throws IOException {		try {			return (StringMap<? extends CharSequence>) BinIO.loadObject( filename );		} catch ( FileNotFoundException e ) {			return null;		} catch ( ClassNotFoundException e ) {			throw new RuntimeException( e );		}	}	/** Utility static method that loads a prefix map.	 * 	 * @param filename the name of the file containing the prefix map.	 * @return the map, or <code>null</code> if the file did not exist.	 * @throws IOException if some IOException (other than {@link FileNotFoundException}) occurred.	 */	@SuppressWarnings("unchecked")	public static PrefixMap<? extends CharSequence> loadPrefixMap( final String filename ) throws IOException {		try {			return  (PrefixMap<? extends CharSequence>) BinIO.loadObject( filename );		} catch ( FileNotFoundException e ) {			return null;		} catch ( ClassNotFoundException e ) {			throw new RuntimeException( e );		}	}	/** Returns a new disk-based index, loading exactly the specified parts and using preloaded {@link Properties}.	 * 	 * @param basename the basename of the index.	 * @param properties the properties obtained from the given basename.	 * @param termMap the term map for this index, or <code>null</code> for no term map.	 * @param prefixMap the prefix map for this index, or <code>null</code> for no prefix map.	 
* @param randomAccess whether the index should be accessible randomly (e.g., if it will	 * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index).	 * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes	 * might be loaded anyway because the compression method for positions requires it).	 * @param queryProperties a map containing associations between {@link Index.UriKeys} and values, or <code>null</code>.	 */	public static BitStreamIndex getInstance( final CharSequence basename, Properties properties, final StringMap<? extends CharSequence> termMap, final PrefixMap<? extends CharSequence> prefixMap, final boolean randomAccess, final boolean documentSizes, final EnumMap<UriKeys,String> queryProperties ) throws ClassNotFoundException, IOException, InstantiationException, IllegalAccessException {		// This could be null if old indices contain SkipIndex		Class<?> indexClass = null;		try {			indexClass = Class.forName( properties.getString( Index.PropertyKeys.INDEXCLASS, "(missing index class)" ));		}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -