⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 signedminimalperfecthash.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.util;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2002-2007 Sebastiano Vigna   * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.io.FileLinesCollection;import java.io.IOException;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.UnflaggedOption;/** Signed order-preserving minimal perfect hash tables. * * <P>Minimal perfect hash tables will always return a result, even for terms that were * not present in the collection indexed by the table. Sometimes you may prefer to single out * terms that were not present in the collection. * * <P>To this purpose, MG4J provides <em>signed</em> minimal perfect tables. In a signed * table, every term in the collection gets a <em>signature</em> that is used to  * tell false positives. Signature may go from the simple hashcode-based signatures * provided by {@link HashCodeSignedMinimalPerfectHash} class, to sophisticated * cryptographic signatures, to (at the other extreme) a class that actually stores * the terms (and thus completely avoids false positives) such as {@link LiterallySignedMinimalPerfectHash}. * * <P>A signed table extends this class, and provides two methods: a  * {@link #initSignatures(Iterable)} method that sets up the necessary data * structures, and a {@link #checkSignature(CharSequence,int)} method that * checks a given character sequence against the signature stored for * a term having given index. * * <P>It is good practise, of course, to replicate the constructors of this * class in all implementing subclasses (by simply invoking <code>super</code> * with the same arguments). Moreover, to be useful classes implementing this * class <em>must</em> be serialisable. * * @author Sebastiano Vigna * @author Marco Olivo * @since 0.4 * @deprecated Use the new hashing stuff in Sux4J. */@Deprecatedpublic abstract class SignedMinimalPerfectHash extends MinimalPerfectHash implements Serializable {	private static final Logger LOGGER = Util.getLogger( MinimalPerfectHash.class );	public static final long serialVersionUID = -7046029254386353130L;	/** Creates a new signed order-preserving minimal perfect hash table for the given	 * terms, using as many weights as the longest term in the collection.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param terms some terms to hash; it is assumed that they do not contain duplicates.	 * @see MinimalPerfectHash#MinimalPerfectHash(Iterable)	 */	public SignedMinimalPerfectHash( final Iterable<? extends CharSequence> terms ) {		super( terms );		initSignatures( terms );	}	/** Creates a new signed order-preserving minimal perfect hash table	 *  for the given terms using the given number of weights.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param terms some terms to hash; it is assumed that no terms share a common prefix of	 * <code>weightLength</code> characters.	 * @param weightLength the number of weights used generating the intermediate hash functions.	 * @see MinimalPerfectHash#MinimalPerfectHash(Iterable, int)	 */	public SignedMinimalPerfectHash( final Iterable<? extends CharSequence> terms, final int weightLength ) {	    super( terms, weightLength );		initSignatures( terms );	}	/** Creates a new signed order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file 	 * of terms using the given number of weights.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param termFile a file containing one term on each line; it is assumed that 	 * it does not contain terms with a common prefix of	 * <code>weightLength</code> characters.	 * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it	 * is assumed to be the platform default encoding.	 * @param weightLength the number of weights used generating the	 * intermediate hash functions.	 * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}.	 * @see MinimalPerfectHash#MinimalPerfectHash(String,String,int,boolean)	 */	public SignedMinimalPerfectHash( final String termFile, final String encoding, final int weightLength, boolean zipped ) {		super( termFile, encoding, weightLength, zipped );		initSignatures( new FileLinesCollection( termFile, encoding, zipped ) );	}	/** Creates a new signed order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file 	 * of terms.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param termFile a file containing one term on each line; it is assumed that 	 * it does not contain terms with a common prefix of	 * <code>weightLength</code> characters.	 * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it	 * is assumed to be the platform default encoding.	 * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}.	 * @see MinimalPerfectHash#MinimalPerfectHash(String,String,boolean)	 */	public SignedMinimalPerfectHash( final String termFile, final String encoding, boolean zipped ) {		super( termFile, encoding, zipped );		initSignatures( new FileLinesCollection( termFile, encoding, zipped ) );	}		/** Creates a new signed order-preserving minimal perfect hash table for the given file 	 * of terms using the given number of weights.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param termFile a file containing one term on each line; it is assumed that 	 * it does not contain terms with a common prefix of	 * <code>weightLength</code> characters.	 * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it	 * is assumed to be the platform default encoding.	 * @param weightLength the number of weights used generating the	 * intermediate hash functions.	 * @see MinimalPerfectHash#MinimalPerfectHash(String,String,int)	 */	public SignedMinimalPerfectHash( final String termFile, final String encoding, final int weightLength) {		super( termFile, encoding, weightLength );		initSignatures( new FileLinesCollection( termFile, encoding ) );	}	/** Creates a new signed order-preserving minimal perfect hash table for the given file 	 * of terms.	 *	 * <P>After calling the corresponding constructor of {@link MinimalPerfectHash}, this	 * constructor will invoke {@link #initSignatures(Iterable)}.	 *	 * @param termFile a file containing one term on each line; it is assumed that 	 * it does not contain terms with a common prefix of	 * <code>weightLength</code> characters.	 * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it	 * is assumed to be the platform default encoding.	 * @see MinimalPerfectHash#MinimalPerfectHash(String,String)	 */	public SignedMinimalPerfectHash( final String termFile, final String encoding ) {		super( termFile, encoding );		initSignatures( new FileLinesCollection( termFile, encoding ) );	}	/** Hashes a given term.	 *	 * @param term a term to hash.	 * @return the position of the given term in the generating collection, starting from 0, if the	 * term was in the original collection; otherwise, -1.	 */	public int getNumber( final CharSequence term ) {		final int i = super.getNumber( term );		if ( checkSignature( term, i ) ) return i;		else return -1;	}	/** Hashes a given term.	 *	 * @param term a term to hash.	 * @return the position of the given term in the generating collection, starting from 0, if the	 * term was in the original collection; otherwise, -1.	 */	public int getNumber( final MutableString term ) {		final int i = super.getNumber( term );		if ( checkSignature( term, i ) ) return i;		else return -1;	}	/** Hashes a term given as a byte-array fragment interpreted in the ISO-8859-1 charset encoding.	 *	 * @param a a byte array.	 * @param off the first valid byte in <code>a</code>.	 * @param len the number of bytes composing the term, starting at <code>off</code>.	 * @return the position of term defined by <code>len</code> bytes starting at <code>off</code> (interpreted	 * as ISO-8859-1 characters) in the generating collection, starting from 0, if the	 * term was in the original collection; otherwise, -1.	 */	public int getNumber( final byte[] a, final int off, final int len ) {		final int i = super.getNumber( a, off, len );		if ( checkSignature( a, off, len, i ) ) return i;		else return -1;	}	/** Sets up the signature system from a collection.	 *	 * <P>This abstract method must be overriden by implementing subclasses.	 * It must set up all data structures that are necessary to handle	 * signatures; in particular, it will usually compute signatures for	 * all terms in the given collection.	 *	 * @param terms the collection of terms given to the constructor of this class.	 * @see HashCodeSignedMinimalPerfectHash#initSignatures(Iterable)	 * @see LiterallySignedMinimalPerfectHash#initSignatures(Iterable)	 */	protected abstract void initSignatures( Iterable<? extends CharSequence> terms ) ;	/** Checks a signature against a character sequence.	 *	 * <P>This abstract method must be overriden by implementing subclasses.	 * It must check whether the signature of the given character sequence matches	 * the one stored for the <code>index</code>-th term.	 *	 * <P>Note that this method and {@link #checkSignature(byte[], int, int, int)} <strong>must</strong>	 * be coherent.	 * 	 * @param term a character sequence.	 * @param index an integer denoting a term in the indexed collection.	 * @return true iff the signature of the given character sequence matches	 * the one stored for the <code>index</code>-th term.	 * @see HashCodeSignedMinimalPerfectHash#checkSignature(CharSequence, int)	 * @see LiterallySignedMinimalPerfectHash#checkSignature(CharSequence,int)	 */	protected abstract boolean checkSignature( CharSequence term, int index );	/** Checks a signature against a byte-array fragment.	 *	 * <P>This abstract method must be overriden by implementing subclasses.	 * It must check whether the signature of the given byte-array fragment 	 * (interpreted as an ISO-8859-1 string) matches	 * the one stored for the <code>index</code>-th term.	 *	 * <P>Note that this method and {@link #checkSignature(CharSequence, int)} <strong>must</strong>	 * be coherent.	 * 	 * @param a a byte array.	 * @param off the first valid byte in <code>a</code>.	 * @param len the number of bytes composing the term, starting at <code>off</code>.	 * @return true if the signature stored for the term defined by <code>len</code> bytes starting at <code>off</code> (interpreted	 * as ISO-8859-1 characters) matches the one stored for the <code>index</code>-th term.	 * @see HashCodeSignedMinimalPerfectHash#checkSignature(CharSequence, int)	 * @see LiterallySignedMinimalPerfectHash#checkSignature(CharSequence,int)	 */	protected abstract boolean checkSignature( byte[] a, int off, int len, int index );		/** Returns a unsigned view of this signed minimal perfect hash.	 * 	 * @return an unsigned view of this minimal perfect hash.	 */	public MinimalPerfectHash asUnsigned() {		return new MinimalPerfectHash( this );	}	    public static void main( final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException {		final SimpleJSAP jsap = new SimpleJSAP( SignedMinimalPerfectHash.class.getName(), "Builds an unsigned order-preserving minimal perfect hash from a signed one.",				new Parameter[] {					new UnflaggedOption( "smph", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the input serialised signed minimal perfect hash table." ),					new UnflaggedOption( "mph", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the output serialised unsigned minimal perfect hash table." )		});				JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;				LOGGER.info( "Reading signed map..." );				final SignedMinimalPerfectHash smph = (SignedMinimalPerfectHash) BinIO.loadObject( jsapResult.getString( "smph" ) );		LOGGER.info( "Writing unsigned map to file..." );				BinIO.storeObject( smph.asUnsigned(), jsapResult.getString( "mph" ) );		LOGGER.info( "Completed." );    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -