📄 signedminimalperfecthash.java
字号:
package it.unimi.dsi.mg4j.util;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2002-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.FileLinesCollection;import java.io.IOException;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.UnflaggedOption;/** Signed order-preserving minimal perfect hash tables. * * <P>Minimal perfect hash tables will always return a result, even for terms that were * not present in the collection indexed by the table. Sometimes you may prefer to single out * terms that were not present in the collection. * * <P>To this purpose, MG4J provides <em>signed</em> minimal perfect tables. In a signed * table, every term in the collection gets a <em>signature</em> that is used to * tell false positives. Signature may go from the simple hashcode-based signatures * provided by {@link HashCodeSignedMinimalPerfectHash} class, to sophisticated * cryptographic signatures, to (at the other extreme) a class that actually stores * the terms (and thus completely avoids false positives) such as {@link LiterallySignedMinimalPerfectHash}. * * <P>A signed table extends this class, and provides two methods: a * {@link #initSignatures(Iterable)} method that sets up the necessary data * structures, and a {@link #checkSignature(CharSequence,int)} method that * checks a given character sequence against the signature stored for * a term having given index. * * <P>It is good practise, of course, to replicate the constructors of this * class in all implementing subclasses (by simply invoking <code>super</code> * with the same arguments). Moreover, to be useful classes implementing this * class <em>must</em> be serialisable. * * @author Sebastiano Vigna * @author Marco Olivo * @since 0.4 * @deprecated Use the new hashing stuff in Sux4J. */@Deprecatedpublic abstract class SignedMinimalPerfectHash extends MinimalPerfectHash implements Serializable { private static final Logger LOGGER = Util.getLogger( MinimalPerfectHash.class ); public static final long serialVersionUID = -7046029254386353130L; /** Creates a new signed order-preserving minimal perfect hash table for the given * terms, using as many weights as the longest term in the collection. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param terms some terms to hash; it is assumed that they do not contain duplicates. * @see MinimalPerfectHash#MinimalPerfectHash(Iterable) */ public SignedMinimalPerfectHash( final Iterable<? extends CharSequence> terms ) { super( terms ); initSignatures( terms ); } /** Creates a new signed order-preserving minimal perfect hash table * for the given terms using the given number of weights. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param terms some terms to hash; it is assumed that no terms share a common prefix of * <code>weightLength</code> characters. * @param weightLength the number of weights used generating the intermediate hash functions. * @see MinimalPerfectHash#MinimalPerfectHash(Iterable, int) */ public SignedMinimalPerfectHash( final Iterable<? extends CharSequence> terms, final int weightLength ) { super( terms, weightLength ); initSignatures( terms ); } /** Creates a new signed order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms using the given number of weights. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param termFile a file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see MinimalPerfectHash#MinimalPerfectHash(String,String,int,boolean) */ public SignedMinimalPerfectHash( final String termFile, final String encoding, final int weightLength, boolean zipped ) { super( termFile, encoding, weightLength, zipped ); initSignatures( new FileLinesCollection( termFile, encoding, zipped ) ); } /** Creates a new signed order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param termFile a file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see MinimalPerfectHash#MinimalPerfectHash(String,String,boolean) */ public SignedMinimalPerfectHash( final String termFile, final String encoding, boolean zipped ) { super( termFile, encoding, zipped ); initSignatures( new FileLinesCollection( termFile, encoding, zipped ) ); } /** Creates a new signed order-preserving minimal perfect hash table for the given file * of terms using the given number of weights. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param termFile a file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions. * @see MinimalPerfectHash#MinimalPerfectHash(String,String,int) */ public SignedMinimalPerfectHash( final String termFile, final String encoding, final int weightLength) { super( termFile, encoding, weightLength ); initSignatures( new FileLinesCollection( termFile, encoding ) ); } /** Creates a new signed order-preserving minimal perfect hash table for the given file * of terms. * * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this * constructor will invoke {@link #initSignatures(Iterable)}. * * @param termFile a file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @see MinimalPerfectHash#MinimalPerfectHash(String,String) */ public SignedMinimalPerfectHash( final String termFile, final String encoding ) { super( termFile, encoding ); initSignatures( new FileLinesCollection( termFile, encoding ) ); } /** Hashes a given term. * * @param term a term to hash. * @return the position of the given term in the generating collection, starting from 0, if the * term was in the original collection; otherwise, -1. */ public int getNumber( final CharSequence term ) { final int i = super.getNumber( term ); if ( checkSignature( term, i ) ) return i; else return -1; } /** Hashes a given term. * * @param term a term to hash. * @return the position of the given term in the generating collection, starting from 0, if the * term was in the original collection; otherwise, -1. */ public int getNumber( final MutableString term ) { final int i = super.getNumber( term ); if ( checkSignature( term, i ) ) return i; else return -1; } /** Hashes a term given as a byte-array fragment interpreted in the ISO-8859-1 charset encoding. * * @param a a byte array. * @param off the first valid byte in <code>a</code>. * @param len the number of bytes composing the term, starting at <code>off</code>. * @return the position of term defined by <code>len</code> bytes starting at <code>off</code> (interpreted * as ISO-8859-1 characters) in the generating collection, starting from 0, if the * term was in the original collection; otherwise, -1. */ public int getNumber( final byte[] a, final int off, final int len ) { final int i = super.getNumber( a, off, len ); if ( checkSignature( a, off, len, i ) ) return i; else return -1; } /** Sets up the signature system from a collection. * * <P>This abstract method must be overriden by implementing subclasses. * It must set up all data structures that are necessary to handle * signatures; in particular, it will usually compute signatures for * all terms in the given collection. * * @param terms the collection of terms given to the constructor of this class. * @see HashCodeSignedMinimalPerfectHash#initSignatures(Iterable) * @see LiterallySignedMinimalPerfectHash#initSignatures(Iterable) */ protected abstract void initSignatures( Iterable<? extends CharSequence> terms ) ; /** Checks a signature against a character sequence. * * <P>This abstract method must be overriden by implementing subclasses. * It must check whether the signature of the given character sequence matches * the one stored for the <code>index</code>-th term. * * <P>Note that this method and {@link #checkSignature(byte[], int, int, int)} <strong>must</strong> * be coherent. * * @param term a character sequence. * @param index an integer denoting a term in the indexed collection. * @return true iff the signature of the given character sequence matches * the one stored for the <code>index</code>-th term. * @see HashCodeSignedMinimalPerfectHash#checkSignature(CharSequence, int) * @see LiterallySignedMinimalPerfectHash#checkSignature(CharSequence,int) */ protected abstract boolean checkSignature( CharSequence term, int index ); /** Checks a signature against a byte-array fragment. * * <P>This abstract method must be overriden by implementing subclasses. * It must check whether the signature of the given byte-array fragment * (interpreted as an ISO-8859-1 string) matches * the one stored for the <code>index</code>-th term. * * <P>Note that this method and {@link #checkSignature(CharSequence, int)} <strong>must</strong> * be coherent. * * @param a a byte array. * @param off the first valid byte in <code>a</code>. * @param len the number of bytes composing the term, starting at <code>off</code>. * @return true if the signature stored for the term defined by <code>len</code> bytes starting at <code>off</code> (interpreted * as ISO-8859-1 characters) matches the one stored for the <code>index</code>-th term. * @see HashCodeSignedMinimalPerfectHash#checkSignature(CharSequence, int) * @see LiterallySignedMinimalPerfectHash#checkSignature(CharSequence,int) */ protected abstract boolean checkSignature( byte[] a, int off, int len, int index ); /** Returns a unsigned view of this signed minimal perfect hash. * * @return an unsigned view of this minimal perfect hash. */ public MinimalPerfectHash asUnsigned() { return new MinimalPerfectHash( this ); } public static void main( final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP( SignedMinimalPerfectHash.class.getName(), "Builds an unsigned order-preserving minimal perfect hash from a signed one.", new Parameter[] { new UnflaggedOption( "smph", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the input serialised signed minimal perfect hash table." ), new UnflaggedOption( "mph", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the output serialised unsigned minimal perfect hash table." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; LOGGER.info( "Reading signed map..." ); final SignedMinimalPerfectHash smph = (SignedMinimalPerfectHash) BinIO.loadObject( jsapResult.getString( "smph" ) ); LOGGER.info( "Writing unsigned map to file..." ); BinIO.storeObject( smph.asUnsigned(), jsapResult.getString( "mph" ) ); LOGGER.info( "Completed." ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -