// MinimalPerfectHash.java — excerpt scraped from a code-viewer page ("字号:" is viewer font-size chrome, not source).
if ( h0 == h1 || h1 == h2 ) h1 = m + h1 % (NODE_OVERHEAD / 2 ); if ( h0 == h2 ) h2 = m + NODE_OVERHEAD / 2 + h2 % ( NODE_OVERHEAD / 2 ); return (int)( ( (long)g[ h0 ] + g[ h1 ] + g[ h2 ] + n4 ) % n ); } /** Hashes a term given as a byte-array fragment interpreted in the ISO-8859-1 charset encoding. * * @param a a byte array. * @param off the first valid byte in <code>a</code>. * @param len the number of bytes composing the term, starting at <code>off</code>. * @return the position of term defined by <code>len</code> bytes starting at <code>off</code> (interpreted * as ISO-8859-1 characters) in the generating collection, starting from 0. If the * term was not in the original collection, the result is a random position. */ public int getNumber( final byte[] a, final int off, final int len ) { if ( t != null ) try { return getFromT( new String( a, off, len, "ISO-8859-1" ) ); } catch ( UnsupportedEncodingException cantHappen ) { throw new RuntimeException( cantHappen ); } int h0 = init[ 0 ], h1 = init[ 1 ], h2 = init[ 2 ]; // We need three these values to map correctly the empty string int c; int i = len; if ( weightLength < i ) i = weightLength; while( i-- != 0 ) { c = a[ off + i ] & 0xFF; h0 ^= weight0[ i ] * c; h1 ^= weight1[ i ] * c; h2 ^= weight2[ i ] * c; } h0 = ( h0 >>> rightShift ) % m; h1 = ( h1 >>> rightShift ) % m; h2 = ( h2 >>> rightShift ) % m; // Deterministic fix: we move h1 or h2 into the overhead space if necessary if ( h0 == h1 || h1 == h2 ) h1 = m + h1 % (NODE_OVERHEAD / 2 ); if ( h0 == h2 ) h2 = m + NODE_OVERHEAD / 2 + h2 % ( NODE_OVERHEAD / 2 ); return (int)( ( (long)g[ h0 ] + g[ h1 ] + g[ h2 ] + n4 ) % n ); } /** Hashes a term given as a byte array interpreted in the ISO-8859-1 charset encoding. * * @param a a byte array. * @return the position of term defined by the bytes in a <code>a</code> (interpreted * as ISO-8859-1 characters) in the generating collection, starting from 0. 
If the * term was not in the original collection, the result is a random position. */ public int getNumber( final byte[] a ) { return getNumber( a, 0, a.length ); } /** Gets a term out of the stored array {@link #t}. * * <P>Note: This method does not check for {@link #t} being non-<code>null</code>. * * @param term a term. * @return the position of the given term in the generating collection, starting from 0. If the * term was not in the original collection, the result is 0. */ protected int getFromT( final CharSequence term ) { int i = n; /* Note that we invoke equality *on the stored MutableString*. This * is essential due to the known weaknesses in CharSequence's contract. */ while( i-- != 0 ) if ( t[ i ].equals( term ) ) return i; return 0; } /** Returns the length of the weight vectors. * * @return the length of weight vectors used in this table. */ public int weightLength() { return weightLength; } /** Returns the number of terms hashed. * * @return the number of terms hashed. */ public int size() { return n; } public boolean hasTerms() { return false; } /** Creates a new order-preserving minimal perfect hash table for the given * terms, using as many weights as the longest term in the collection. * * <P><strong>Caution:</strong> if the collection contains twice the same * term, this constructor will never return. * * @param terms some terms to hash; it is assumed that there are no duplicates. */ public MinimalPerfectHash( final Iterable<? extends CharSequence> terms ) { this( terms, WEIGHT_UNKNOWN ); } /** Creates a new order-preserving minimal perfect hash table for the * given terms using the given number of weights. * * <P>This constructor can be used to force the use of a reduced number of weights if you are sure that * the first <code>weightLength</code> characters of each term in <code>terms</code> are * distinct. 
* * <P>If you do not know your weight length in advance, you can pass two * special values as <code>weightLength</code>: {@link #WEIGHT_UNKNOWN} * will force the computation of <code>weightLength</code> as the maximum * length of a term, whereas {@link #WEIGHT_UNKNOWN_SORTED_TERMS} forces * the assumption that terms are sorted: in this case, we search for * the minimum prefix that will disambiguate all terms in the collection * (a shorter prefix yields faster lookups). * * <P><strong>Caution:</strong> if two terms in the collection have a * common prefix of length <code>weightLength</code> this constructor will * never return. * * @param terms some terms to hash; if <code>weightLength</code> * is specified explicitly, it is assumed that there are no terms with a common prefix of * <code>weightLength</code> characters. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. * @see #MinimalPerfectHash(Iterable) */ @SuppressWarnings("unused") // TODO: move it to the first for loop when javac has been fixed public MinimalPerfectHash( final Iterable<? extends CharSequence> terms, int weightLength ) { if ( weightLength != WEIGHT_UNKNOWN && weightLength != WEIGHT_UNKNOWN_SORTED_TERMS && weightLength <= 0 ) throw new IllegalArgumentException( "Non-positive weight length: " + weightLength ); // First of all we compute the size, either by size(), if possible, or simply by iterating. if ( terms instanceof Collection) n = ((Collection<? extends CharSequence>)terms).size(); else { int c = 0; // Do not add a suppression annotation--it breaks javac for( CharSequence o: terms ) c++; n = c; } n4 = n * 4; m = (int)Math.ceil( n * ENLARGEMENT_FACTOR ); rightShift = ( Integer.SIZE - Fast.mostSignificantBit( m ) ) / 2; g = new int[ m + NODE_OVERHEAD ]; if ( weightLength < 0 ) { LOGGER.info( "Computing weight length..." ); Iterator<? 
extends CharSequence> i = terms.iterator(); int maxLength = Integer.MIN_VALUE, minPrefixLength = Integer.MIN_VALUE; if ( i.hasNext() ) { CharSequence currTerm; MutableString prevTerm = new MutableString( i.next() ); // We cannot assume that the string will persist after a call to next(). int k, prevLength = prevTerm.length(), currLength; maxLength = prevLength; while( i.hasNext() ) { currTerm = i.next(); currLength = currTerm.length(); if ( weightLength == WEIGHT_UNKNOWN_SORTED_TERMS ) { for( k = 0; k < prevLength && k < currLength && currTerm.charAt( k ) == prevTerm.charAt( k ); k++ ); if ( k == currLength && k == prevLength ) throw new IllegalArgumentException( "The term list contains a duplicate (" + currTerm + ")" ); minPrefixLength = Math.max( minPrefixLength, k + 1 ); prevTerm.replace( currTerm ); // We cannot assume that the string will persist after a call to next(). prevLength = currLength; } maxLength = Math.max( maxLength, currLength ); } } weightLength = weightLength == WEIGHT_UNKNOWN_SORTED_TERMS ? minPrefixLength : maxLength; LOGGER.info( "Completed [max term length=" + maxLength + "; weight length=" + weightLength + "]." ); } if ( weightLength < 0 ) weightLength = 0; weight0 = new int[ weightLength ]; weight1 = new int[ weightLength ]; weight2 = new int[ weightLength ]; init = new int[ 3 ]; this.weightLength = weightLength; if ( n < TERM_THRESHOLD ) { int j = 0; t = new MutableString[ n ]; for( Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext(); ) t[ j++ ] = new MutableString( i.next() ); } else { t = null; new Visit( terms ); } } /** Creates a new order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms using the given number of weights. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. 
* @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, int weightLength, boolean zipped ) { this( new FileLinesCollection( termFile, encoding, zipped ), weightLength ); } /** Creates a new order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, boolean zipped ) { this( termFile, encoding, WEIGHT_UNKNOWN, zipped ); } /** Creates a new order-preserving minimal perfect hash table for the given file * of terms using the given number of weights. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. 
* @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, int weightLength ) { this( new FileLinesCollection( termFile, encoding ), weightLength ); } /** Creates a new order-preserving minimal perfect hash table for the given file * of terms. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding ) { this( termFile, encoding, WEIGHT_UNKNOWN, false ); } /** Creates a new minimal perfect hash by copying a given one; non-transient fields are (shallow) copied. * * @param mph the perfect hash to be copied. */ protected MinimalPerfectHash( final MinimalPerfectHash mph ) { this.n = mph.n; this.m = mph.m; this.weightLength = mph.weightLength; this.weight0 = mph.weight0; this.weight1 = mph.weight1; this.weight2 = mph.weight2; this.init = mph.init; this.g = mph.g; this.rightShift = mph.rightShift; this.n4 = mph.n4; this.t = mph.t; } /** The internal state of a visit. */ private class Visit { /** {@link #m} plus 2. */ final int mPlusOverhead = m + NODE_OVERHEAD; /** An 3×n array recording the triple of vertices involved in each hyperedge. It is *reversed* w.r.t. what you would expect to reduce object creation. */ final int[][] edge = new int[ 3 ][ n ]; /** Whether a hyperedge has been already removed. */ final boolean[] removed = new boolean[ n ]; /** For each vertex of the intermediate hypergraph, the vector of incident hyperedges. */ final int[] inc = new int[ mPlusOverhead * 3 ]; /** The next position to fill in the respective incidence vector. * Used also to store the hyperedge permutation and speed up permute(). 
*/
		final int[] last = new int[ mPlusOverhead ];
		/** For each vertex of the intermediate hypergraph, the offset into
		 * the vector of incident hyperedges. Used also to speed up permute(). */
		final int[] incOffset = new int[ mPlusOverhead ];
		/** The hyperedge stack. Used also to invert the hyperedge permutation. */
		final int[] stack = new int[ n ];
		/** The degree of each vertex of the intermediate hypergraph. */
		final int[] d = new int[ mPlusOverhead ];
		/** Initial top of the hyperedge stack. */
		int top;

		public Visit( final Iterable<? extends CharSequence> terms ) {
// --- Code-viewer page chrome (keyboard-shortcut help), not part of the source file: ---
// Copy code: Ctrl+C · Search code: Ctrl+F · Fullscreen: F11 · Toggle theme: Ctrl+Shift+D
// Show shortcuts: ? · Increase font size: Ctrl+= · Decrease font size: Ctrl+-