📄 diskbasedindex.java
字号:
package it.unimi.dsi.mg4j.index;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2004-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.ints.IntList;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.longs.LongArrayList;import it.unimi.dsi.fastutil.longs.LongList;import it.unimi.dsi.fastutil.longs.LongLists;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.index.Index.UriKeys;import it.unimi.dsi.io.ByteBufferInputStream;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.util.Properties;import it.unimi.dsi.mg4j.util.SemiExternalOffsetList;import it.unimi.dsi.util.StringMap;import it.unimi.dsi.util.PrefixMap;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.nio.ByteBuffer;import java.nio.channels.FileChannel.MapMode;import java.util.EnumMap;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;/** A static container providing facilities to load an index based on data stored on disk. 
 *
 * <P>This class contains several useful static methods
 * such as {@link #readOffsets(InputBitStream, int)} and {@link #readSizes(InputBitStream, int)},
 * and static factory methods such as {@link #getInstance(CharSequence, boolean, boolean, boolean, EnumMap)}
 * that take care of reading the properties associated with the index, identify
 * the correct {@link it.unimi.dsi.mg4j.index.Index} implementation that
 * should be used to load the index, and load the necessary data into memory.
 *
 * <p>As an option, a disk-based index can be <em>loaded</em> into main memory (key: {@link Index.UriKeys#INMEMORY}), returning
 * an {@link it.unimi.dsi.mg4j.index.InMemoryIndex}/{@link InMemoryHPIndex}, or <em>mapped</em> into main memory (key: {@link Index.UriKeys#MAPPED}),
 * returning a {@link MemoryMappedIndex}/{@link InMemoryHPIndex} (note that the value assigned to the keys is irrelevant).
 * In both cases some insurmountable Java problems
 * prevent using indices whose size exceeds two gigabytes (but see {@link MemoryMappedIndex} for
 * some elaboration on this topic).
 *
 * <p>Moreover, by default the
 * term-offset list is accessed using a {@link it.unimi.dsi.mg4j.util.SemiExternalOffsetList}
 * with a step of {@link #DEFAULT_OFFSET_STEP}. This behaviour can be changed using
 * the URI key {@link UriKeys#OFFSETSTEP}.
 *
 * <p>Disk-based indices are the workhorse of MG4J. All other indices (clustered,
 * remote, etc.) ultimately rely on disk-based indices to provide results.
 *
 * <p>Note that not all data produced by {@link it.unimi.dsi.mg4j.tool.Scan} and
 * by the other indexing utilities are actually necessary to run a disk-based
 * index. Usually the property file and the index file (plus the positions file,
 * for {@linkplain BitStreamHPIndex high-performance indices}) are sufficient: if one
 * needs random access, also the offsets file must be present, and if the
 * compression method requires document sizes or if sizes are requested explicitly,
 * also the sizes file must be present.
A {@link StringMap}
 * and possibly a {@link PrefixMap} will be fetched
 * automatically by {@link #getInstance(CharSequence, boolean, boolean)}
 * using standard extensions.
 *
 * <h2>Thread safety</h2>
 *
 * <p>A disk-based index is thread safe as long as the offset list, the size list and
 * the term/prefix map are. The static factory methods provided by this class load
 * offsets and sizes using data structures that are thread safe. If you use directly
 * a constructor, instead, it is your responsibility to pass thread-safe data structures.
 *
 * @author Sebastiano Vigna
 * @since 1.1
 */
public class DiskBasedIndex {
    /** The logger of this class. */
    private static final Logger LOGGER = Util.getLogger( DiskBasedIndex.class );

    // NOTE(review): the class declaration above has no visible "implements Serializable";
    // confirm whether this field is still needed.
    private static final long serialVersionUID = 0;

    /** The default value for the query parameter {@link Index.UriKeys#OFFSETSTEP}. */
    public final static int DEFAULT_OFFSET_STEP = 256;

    /** Standard extension for the index bitstream. */
    public static final String INDEX_EXTENSION = ".index";

    /** Standard extension for the positions bitstream of an {@linkplain BitStreamHPIndexWriter high-performance index}. */
    public static final String POSITIONS_EXTENSION = ".positions";

    /** Standard extension for the index properties. */
    public static final String PROPERTIES_EXTENSION = ".properties";

    /** Standard extension for the file of sizes. */
    public static final String SIZES_EXTENSION = ".sizes";

    /** Standard extension for the file of offsets. */
    public static final String OFFSETS_EXTENSION = ".offsets";

    /** Standard extension for the file of global counts. */
    public static final String GLOBCOUNTS_EXTENSION = ".globcounts";

    /** Standard extension for the file of frequencies. */
    public static final String FREQUENCIES_EXTENSION = ".frequencies";

    /** Standard extension for the file of terms. */
    public static final String TERMS_EXTENSION = ".terms";

    /** Standard extension for the file of terms, unsorted.
*/ public static final String UNSORTED_TERMS_EXTENSION = ".terms.unsorted"; /** Standard extension for the term map. */ public static final String TERMMAP_EXTENSION = ".termmap"; /** Standard extension for the prefix map. */ public static final String PREFIXMAP_EXTENSION = ".prefixmap"; /** Standard extension for the stats file. */ public static final String STATS_EXTENSION = ".stats"; private DiskBasedIndex() {} /** Utility method to load a compressed offset file into a list. * * @param in the input bit stream providing the offsets (see {@link BitStreamIndexWriter}). * @param T the number of terms indexed. * @return a list of longs backed by an array; the list has * an additional final element of index <code>T</code> that gives the number * of bytes of the index file. */ public static LongList readOffsets( final InputBitStream in, final int T ) throws IOException { final long[] offset = new long[ T + 1 ]; LOGGER.debug( "Loading offsets..." ); offset[ 0 ] = in.readLongGamma(); for( int i = 0; i < T; i++ ) offset[ i + 1 ] = in.readLongGamma() + offset[ i ]; LOGGER.debug( "Completed." ); return LongArrayList.wrap( offset ); } /** Utility method to load a compressed size file into a list. * * @param in the input bit stream providing the offsets (see {@link BitStreamIndexWriter}). * @param N the number of documents indexed. * @return a list of integers backed by an array. */ public static IntList readSizes( final InputBitStream in, final int N ) throws IOException { final int[] size = new int[ N ]; LOGGER.debug( "Loading sizes..." ); for( int i = 0; i < N; i++ ) size[ i ] = in.readGamma(); LOGGER.debug( "Completed." ); return IntArrayList.wrap( size ); } /** Utility static method that loads a term map. * * @param filename the name of the file containing the term map. * @return the map, or <code>null</code> if the file did not exist. * @throws IOException if some IOException (other than {@link FileNotFoundException}) occurred. 
*/ @SuppressWarnings("unchecked") public static StringMap<? extends CharSequence> loadStringMap( final String filename ) throws IOException { try { return (StringMap<? extends CharSequence>) BinIO.loadObject( filename ); } catch ( FileNotFoundException e ) { return null; } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); } } /** Utility static method that loads a prefix map. * * @param filename the name of the file containing the prefix map. * @return the map, or <code>null</code> if the file did not exist. * @throws IOException if some IOException (other than {@link FileNotFoundException}) occurred. */ @SuppressWarnings("unchecked") public static PrefixMap<? extends CharSequence> loadPrefixMap( final String filename ) throws IOException { try { return (PrefixMap<? extends CharSequence>) BinIO.loadObject( filename ); } catch ( FileNotFoundException e ) { return null; } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); } } /** Returns a new disk-based index, loading exactly the specified parts and using preloaded {@link Properties}. * * @param basename the basename of the index. * @param properties the properties obtained from the given basename. * @param termMap the term map for this index, or <code>null</code> for no term map. * @param prefixMap the prefix map for this index, or <code>null</code> for no prefix map. * @param randomAccess whether the index should be accessible randomly (e.g., if it will * be possible to call {@link IndexReader#documents(int)} on the index readers returned by the index). * @param documentSizes if true, document sizes will be loaded (note that sometimes document sizes * might be loaded anyway because the compression method for positions requires it). * @param queryProperties a map containing associations between {@link Index.UriKeys} and values, or <code>null</code>. */ public static BitStreamIndex getInstance( final CharSequence basename, Properties properties, final StringMap<? 
extends CharSequence> termMap, final PrefixMap<? extends CharSequence> prefixMap, final boolean randomAccess, final boolean documentSizes, final EnumMap<UriKeys,String> queryProperties ) throws ClassNotFoundException, IOException, InstantiationException, IllegalAccessException { // This could be null if old indices contain SkipIndex Class<?> indexClass = null; try { indexClass = Class.forName( properties.getString( Index.PropertyKeys.INDEXCLASS, "(missing index class)" )); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -