📄 immutableexternalprefixdictionary.java
字号:
package it.unimi.dsi.mg4j.util;/* * MG4J: Managing Gigabytes for Java** Copyright (C) 2005-2007 Sebastiano Vigna ** This library is free software; you can redistribute it and/or modify it* under the terms of the GNU Lesser General Public License as published by the Free* Software Foundation; either version 2.1 of the License, or (at your option)* any later version.** This library is distributed in the hope that it will be useful, but* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License* for more details.** You should have received a copy of the GNU Lesser General Public License* along with this program; if not, write to the Free Software* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.**/import it.unimi.dsi.fastutil.booleans.BooleanIterator;import it.unimi.dsi.fastutil.chars.Char2IntOpenHashMap;import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;import it.unimi.dsi.fastutil.objects.AbstractObjectList;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.ObjectIterator;import it.unimi.dsi.mg4j.compression.Decoder;import it.unimi.dsi.mg4j.compression.PrefixCodec;import it.unimi.dsi.mg4j.compression.PrefixCoder;import it.unimi.dsi.mg4j.index.PrefixMap;import it.unimi.dsi.mg4j.index.TermMap;import it.unimi.dsi.mg4j.io.FastBufferedReader;import it.unimi.dsi.mg4j.io.FileLinesCollection;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.search.Interval;import it.unimi.dsi.mg4j.search.Intervals;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.nio.charset.Charset;import java.util.Arrays;import java.util.Collection;import java.util.Iterator;import java.util.List;import java.util.NoSuchElementException;import java.util.zip.GZIPInputStream;import org.apache.commons.io.IOUtils;import cern.colt.bitvector.BitVector;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;import com.martiansoftware.jsap.stringparsers.ForNameStringParser;/** An immutable prefix dictionary mostly stored in external memory. * * <p>A <em>prefix dictionary</em> is a data structure that stores efficiently a lexicographically ordered list * of character sequences so that it is possible to obtain, given a character sequence <var>s</var>, * its position in the sequence, and also the (possibly empty) interval of the sequence composed by * character sequences that are extensions of <var>s</var>. In other words, a prefix dictionary * is an implementation of a {@link it.unimi.dsi.mg4j.index.TermMap} and of a {@link it.unimi.dsi.mg4j.index.PrefixMap} * at the same time. * * <P>This class releases on a <em>dump stream</em> most of the data that * would be contained in the corresponding internal-memory dictionary * (e.g., a {@link it.unimi.dsi.mg4j.util.TernaryIntervalSearchTree}). * More precisely, each * block (with user-definable length, possibly the size of a basic disk I/O operation) * is filled as much as possible with strings front coded and compressed with a * {@link it.unimi.dsi.mg4j.compression.PrefixCodec} provided by the abstract factory method {@link #getPrefixCodec(int[]) getPrefixCodec()}. * Each block starts with the length of the first string in unary, followed by the encoding of the * string. Then, for each string we write in unary the length of the common prefix (in characters) * with the previous string, the length of the remaning suffix (in characters) * and finally the encoded suffix. Note that if the encoding of a string is longer than a block, the string will occupy more than one block. * * <P>We keep track using an {@link it.unimi.dsi.mg4j.util.ImmutableExternalPrefixDictionary.IntervalApproximator IntervalApproximator} returned * by the abstract factory method {@link #getIntervalApproximator(List, PrefixCoder) getIntervalApproximator()} * of the strings at the start of each block: thus, we are able to retrieve the interval corresponding * to a given prefix by calling {@link it.unimi.dsi.mg4j.util.ImmutableBinaryTrie#getApproximatedInterval(BooleanIterator) getApproximatedInterval()} * and scanning at most two blocks. * * <P>The two abovementioned abstract factory methods * are the only methods that must be implemented by concrete subclasses. For instance, * {@link it.unimi.dsi.mg4j.util.ImmutableExternalTreePrefixDictionary} uses a {@link it.unimi.dsi.mg4j.compression.HuffmanCodec} * to compress the strings, and keeps the block delimiters in a {@link it.unimi.dsi.mg4j.util.TernaryIntervalSearchTree}, whereas * {@link it.unimi.dsi.mg4j.util.ImmutableExternalTriePrefixDictionary} uses a {@link it.unimi.dsi.mg4j.compression.HuTuckerCodec} * to compress the strings, and the same codec to store the block delimiters in an {@link it.unimi.dsi.mg4j.util.ImmutableTriePrefixTree}. * * <p>Besides handling externalisation and compression, this abstract implementation has a number of useful features: * <ul> * <li>it implements a number of interfaces: {@link it.unimi.dsi.fastutil.objects.ObjectArrayList}, * exposing the list of character sequences, and by means of {@link java.util.List#indexOf(java.lang.Object) indexOf()}, * also the inverse mapping; {@link it.unimi.dsi.mg4j.index.TermMap} * and {@link it.unimi.dsi.mg4j.index.PrefixMap}; * <li>it is <em>two way</em>: it also returns, given an index in the * list, the corresponding character sequence, and, given an interval of character sequences, * their (possibly empty) common prefix (i.e., it implements all {@link it.unimi.dsi.mg4j.index.TermMap}'s * and {@link it.unimi.dsi.mg4j.index.PrefixMap}'s optional operations). * <li>it is heavily optimised: for instance {@link #iterator()} will return an iterator * that just scans linearly the dump stream (instead of using the superclass implementation, * which scans the list using calls to {@link #get(int)} in a <code>for</code> loop). * </ul> * * <h3>Self-contained or non-self-contained</h3> * * <P>There are two kinds of external prefix dictionaries: self-contained and non-self-contained. * In the first case, you get a serialised object that you can load at any time. The dump * stream is serialised with the object and expanded at each deserialisation in the Java temporary directory. * If you deserialise a dictionary several times, you will get correspondingly many copies of * the dump stream in the temporary directory. The dump streams are deleted when the JVM * exits. This mechanism is not very efficient, but since this class implements several * interfaces it is essential that clients can make the thing work in a standard way. * * <P>Alternatively, you can give at creation time a filename for the dump stream. * The resulting non-self-contained external prefix dictionary * can be serialised, but after deserialisation * you need to set back the {@linkplain #setDumpStream(CharSequence) dump stream filename} * or even directly the {@linkplain #setDumpStream(InputBitStream) dump stream} (for instance, to * an {@linkplain it.unimi.dsi.mg4j.io.OutputBitStream#OutputBitStream(byte[]) output bit stream * wrapping a byte array where the dump stream has been loaded}). You can deserialise many * copies of an external prefix dictionary, letting all copies share the same dump stream. * * <P>This data structure is not synchronised, and concurrent reads may cause problems * because of clashes in the usage of the underlying input bit stream. It would not * be a good idea in any case to open a new stream for each caller, as that would * certainly lead to disk thrashing. * * <P>The {@linkplain #main(String[]) main method} of this class, as usual with immutable containers in MG4J, * helps in building large external prefix dictionaries. Of course, you must provide a concrete subclass of this class. * * @author Sebastiano Vigna * @since 0.9.3 * @deprecated Moved to <code>dsiutils</code>. */@Deprecatedpublic abstract class ImmutableExternalPrefixDictionary extends AbstractObjectList<CharSequence> implements TermMap, PrefixMap, Serializable { final private static boolean DEBUG = false; final private static boolean ASSERTS = false; public static final long serialVersionUID = 1L; /** A data structure providing queries for approximated prefix intervals. * * <P>An interval approximator contains a list of words, and answers to queries about * <em>approximated prefix intervals</em>. * * <P>Given a word <var>w</var>, the corresponding approximated prefix interval is * defined as follows: if the words in the approximator are thought of as left interval extremes in a * larger lexicographically ordered set of words, and we number these word intervals using the * indices of their left extremes, then the first word extending <var>w</var> would be in the * word interval given by the left extreme of the interval returned by this method, whereas * the last word extending <var>w</var> would be in the word interval given by the right * extreme of the interval returned by this method. If no word in the larger set could possibly extend * <var>w</var> (because <var>w</var> is smaller than the lexicographically smallest word in the approximator) * the result is just an {@linkplain it.unimi.dsi.mg4j.search.Intervals#EMPTY_INTERVAL empty interval}. * * <P>LexicalInterval approximators are used by {@linkplain ImmutableExternalPrefixDictionary external prefix dictionaries} * to locate the disk blocks in which the strings delimiting an interval * might be found. * * @see ImmutableExternalPrefixDictionary */ public interface IntervalApproximator { /** Returns an approximated prefix interval around the specified word. * * @param word a word. * @return an approximated prefix interval for <code>word</code> */ Interval getApproximatedInterval( CharSequence word ); } /** The standard block size (in bytes). */ public final static int STD_BLOCK_SIZE = 1024; /** The in-memory data structure used to approximate intervals.. */ final protected IntervalApproximator intervalApproximator; /** The block size of this dictionary (in bits). */ final protected long blockSize; /** A decoder used to read data from the dump stream. */ final protected Decoder decoder; /** A map (given by an array) from symbols in the coder to characters. */ final protected char[] symbol2char; /** A map from characters to symbols of the coder. */ final protected Char2IntOpenHashMap char2symbol; /** The number of terms in this dictionary. */ final protected int size; /** The index of the first word in each block, plus an additional entry containing {@link #size}. */ final private int[] blockStart; /** An array parallel to {@link #blockStart} giving the offset in blocks in the dump file * of the corresponding word in {@link #blockStart}. If there are no overflows, this will just * be an initial segment of the natural numbers, but overflows cause jumps. */ final private int[] blockOffset; /** Whether this dictionary is self-contained. */ final private boolean selfContained; /** The length in bytes of the dump stream, both for serialisation purposes and for minimal checks. */ final private long dumpStreamLength; /** The filename of the temporary dump stream, or of the dump stream created by the constructor or by readObject(). */ private transient String tempDumpStreamFilename; /** If true, the creation of the last {@link ImmutableExternalPrefixDictionary.DumpStreamIterator} was not * followed by a call to any get method. */ private transient boolean iteratorIsUsable; /** A reference to the dump stream. */ private transient InputBitStream dumpStream; /** Creates an external dictionary. * * <P>This constructor does not assume that strings returned by <code>terms.iterator()</code> * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}. * * @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the dictionary. * @param blockSizeInBytes the block size (in bytes). * @param dumpStreamFilename the name of the dump stream, or <code>null</code> for a dictionary * with an automatic dump stream. */ public ImmutableExternalPrefixDictionary( final Iterable<? extends CharSequence> terms, final int blockSizeInBytes, final CharSequence dumpStreamFilename ) throws IOException { this.blockSize = blockSizeInBytes * 8; this.selfContained = dumpStreamFilename == null; // First of all, we gather frequencies for all Unicode characters int[] frequency = new int[ Character.MAX_VALUE + 1 ]; int maxWordLength = 0; CharSequence s; int count = 0; final MutableString prevTerm = new MutableString(); for( Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext(); ) { s = i.next(); maxWordLength = Math.max( s.length(), maxWordLength ); for( int j = s.length(); j-- != 0; ) frequency[ s.charAt( j ) ]++; if ( count > 0 && prevTerm.compareTo( s ) >= 0 ) throw new IllegalArgumentException( "The provided term collection is not sorted, or contains duplicates [" + prevTerm + ", " + s + "]" ); count++; prevTerm.replace( s ); } size = count; // Then, we compute the number of actually used characters count = 0; for( int i = frequency.length; i-- != 0; ) if ( frequency[ i ] != 0 ) count++; /* Now we remap used characters in f, building at the same time maps from * symbol to characters and from characters to symbols. */ int[] packedFrequency = new int[ count ]; symbol2char = new char[ count ]; char2symbol = new Char2IntOpenHashMap( count ); char2symbol.defaultReturnValue( -1 ); for( int i = frequency.length, k = count; i-- != 0; ) { if ( frequency[ i ] != 0 ) { packedFrequency[ --k ] = frequency[ i ]; symbol2char[ k ] = (char)i; char2symbol.put( (char)i, k ); } } char2symbol.trim(); // We now build the coder used to code the strings final PrefixCoder prefixCoder;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -