// MinimalPerfectHash.java — excerpt scraped from a code-viewer page ("字号:" is viewer font-size chrome, not source).
if ( h0 == h1 || h1 == h2 ) h1 = m + h1 % (NODE_OVERHEAD / 2 ); if ( h0 == h2 ) h2 = m + NODE_OVERHEAD / 2 + h2 % ( NODE_OVERHEAD / 2 ); return (int)( ( (long)g[ h0 ] + g[ h1 ] + g[ h2 ] + n4 ) % n ); } /** Hashes a term given as a byte-array fragment interpreted in the ISO-8859-1 charset encoding. * * @param a a byte array. * @param off the first valid byte in <code>a</code>. * @param len the number of bytes composing the term, starting at <code>off</code>. * @return the position of term defined by <code>len</code> bytes starting at <code>off</code> (interpreted * as ISO-8859-1 characters) in the generating collection, starting from 0. If the * term was not in the original collection, the result is a random position. */ public int getNumber( final byte[] a, final int off, final int len ) { if ( t != null ) try { return getFromT( new String( a, off, len, "ISO-8859-1" ) ); } catch ( UnsupportedEncodingException cantHappen ) { throw new RuntimeException( cantHappen ); } int h0 = init[ 0 ], h1 = init[ 1 ], h2 = init[ 2 ]; // We need three these values to map correctly the empty string int c; int i = len; if ( weightLength < i ) i = weightLength; while( i-- != 0 ) { c = a[ off + i ] & 0xFF; h0 ^= weight0[ i ] * c; h1 ^= weight1[ i ] * c; h2 ^= weight2[ i ] * c; } h0 = ( h0 >>> rightShift ) % m; h1 = ( h1 >>> rightShift ) % m; h2 = ( h2 >>> rightShift ) % m; // Deterministic fix: we move h1 or h2 into the overhead space if necessary if ( h0 == h1 || h1 == h2 ) h1 = m + h1 % (NODE_OVERHEAD / 2 ); if ( h0 == h2 ) h2 = m + NODE_OVERHEAD / 2 + h2 % ( NODE_OVERHEAD / 2 ); return (int)( ( (long)g[ h0 ] + g[ h1 ] + g[ h2 ] + n4 ) % n ); } /** Hashes a term given as a byte array interpreted in the ISO-8859-1 charset encoding. * * @param a a byte array. * @return the position of term defined by the bytes in a <code>a</code> (interpreted * as ISO-8859-1 characters) in the generating collection, starting from 0. 
If the * term was not in the original collection, the result is a random position. */ public int getNumber( final byte[] a ) { return getNumber( a, 0, a.length ); } /** Gets a term out of the stored array {@link #t}. * * <P>Note: This method does not check for {@link #t} being non-<code>null</code>. * * @param term a term. * @return the position of the given term in the generating collection, starting from 0. If the * term was not in the original collection, the result is 0. */ protected int getFromT( final CharSequence term ) { int i = n; /* Note that we invoke equality *on the stored MutableString*. This * is essential due to the known weaknesses in CharSequence's contract. */ while( i-- != 0 ) if ( t[ i ].equals( term ) ) return i; return 0; } /** Returns the length of the weight vectors. * * @return the length of weight vectors used in this table. */ public int weightLength() { return weightLength; } /** Returns the number of terms hashed. * * @return the number of terms hashed. */ public int size() { return n; } public boolean hasTerms() { return false; } /** Creates a new order-preserving minimal perfect hash table for the given * terms, using as many weights as the longest term in the collection. * * <P><strong>Caution:</strong> if the collection contains twice the same * term, this constructor will never return. * * @param terms some terms to hash; it is assumed that there are no duplicates. */ public MinimalPerfectHash( final Iterable<? extends CharSequence> terms ) { this( terms, WEIGHT_UNKNOWN ); } /** Creates a new order-preserving minimal perfect hash table for the * given terms using the given number of weights. * * <P>This constructor can be used to force the use of a reduced number of weights if you are sure that * the first <code>weightLength</code> characters of each term in <code>terms</code> are * distinct. 
* * <P>If you do not know your weight length in advance, you can pass two * special values as <code>weightLength</code>: {@link #WEIGHT_UNKNOWN} * will force the computation of <code>weightLength</code> as the maximum * length of a term, whereas {@link #WEIGHT_UNKNOWN_SORTED_TERMS} forces * the assumption that terms are sorted: in this case, we search for * the minimum prefix that will disambiguate all terms in the collection * (a shorter prefix yields faster lookups). * * <P><strong>Caution:</strong> if two terms in the collection have a * common prefix of length <code>weightLength</code> this constructor will * never return. * * @param terms some terms to hash; if <code>weightLength</code> * is specified explicitly, it is assumed that there are no terms with a common prefix of * <code>weightLength</code> characters. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. * @see #MinimalPerfectHash(Iterable) */ @SuppressWarnings("unused") // TODO: move it to the first for loop when javac has been fixed public MinimalPerfectHash( final Iterable<? extends CharSequence> terms, int weightLength ) { if ( weightLength != WEIGHT_UNKNOWN && weightLength != WEIGHT_UNKNOWN_SORTED_TERMS && weightLength <= 0 ) throw new IllegalArgumentException( "Non-positive weight length: " + weightLength ); // First of all we compute the size, either by size(), if possible, or simply by iterating. if ( terms instanceof Collection) n = ((Collection<? extends CharSequence>)terms).size(); else { int c = 0; // Do not add a suppression annotation--it breaks javac for( CharSequence o: terms ) c++; n = c; } n4 = n * 4; m = (int)Math.ceil( n * ENLARGEMENT_FACTOR ); rightShift = ( Integer.SIZE - Fast.mostSignificantBit( m ) ) / 2; g = new int[ m + NODE_OVERHEAD ]; if ( weightLength < 0 ) { LOGGER.info( "Computing weight length..." ); Iterator<? 
extends CharSequence> i = terms.iterator(); int maxLength = Integer.MIN_VALUE, minPrefixLength = Integer.MIN_VALUE; if ( i.hasNext() ) { CharSequence currTerm; MutableString prevTerm = new MutableString( i.next() ); // We cannot assume that the string will persist after a call to next(). int k, prevLength = prevTerm.length(), currLength; maxLength = prevLength; while( i.hasNext() ) { currTerm = i.next(); currLength = currTerm.length(); if ( weightLength == WEIGHT_UNKNOWN_SORTED_TERMS ) { for( k = 0; k < prevLength && k < currLength && currTerm.charAt( k ) == prevTerm.charAt( k ); k++ ); if ( k == currLength && k == prevLength ) throw new IllegalArgumentException( "The term list contains a duplicate (" + currTerm + ")" ); minPrefixLength = Math.max( minPrefixLength, k + 1 ); prevTerm.replace( currTerm ); // We cannot assume that the string will persist after a call to next(). prevLength = currLength; } maxLength = Math.max( maxLength, currLength ); } } weightLength = weightLength == WEIGHT_UNKNOWN_SORTED_TERMS ? minPrefixLength : maxLength; LOGGER.info( "Completed [max term length=" + maxLength + "; weight length=" + weightLength + "]." ); } if ( weightLength < 0 ) weightLength = 0; weight0 = new int[ weightLength ]; weight1 = new int[ weightLength ]; weight2 = new int[ weightLength ]; init = new int[ 3 ]; this.weightLength = weightLength; if ( n < TERM_THRESHOLD ) { int j = 0; t = new MutableString[ n ]; for( Iterator<? extends CharSequence> i = terms.iterator(); i.hasNext(); ) t[ j++ ] = new MutableString( i.next() ); } else { t = null; new Visit( terms ); } } /** Creates a new order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms using the given number of weights. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. 
* @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, int weightLength, boolean zipped ) { this( new FileLinesCollection( termFile, encoding, zipped ), weightLength ); } /** Creates a new order-preserving minimal perfect hash table for the (possibly <samp>gzip</samp>'d) given file * of terms. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param zipped if true, the provided file is zipped and will be opened using a {@link GZIPInputStream}. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, boolean zipped ) { this( termFile, encoding, WEIGHT_UNKNOWN, zipped ); } /** Creates a new order-preserving minimal perfect hash table for the given file * of terms using the given number of weights. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @param weightLength the number of weights used generating the * intermediate hash functions, {@link #WEIGHT_UNKNOWN} or {@link #WEIGHT_UNKNOWN_SORTED_TERMS}. 
* @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding, int weightLength ) { this( new FileLinesCollection( termFile, encoding ), weightLength ); } /** Creates a new order-preserving minimal perfect hash table for the given file * of terms. * * @param termFile an file containing one term on each line; it is assumed that * it does not contain terms with a common prefix of * <code>weightLength</code> characters. * @param encoding the encoding of <code>termFile</code>; if <code>null</code>, it * is assumed to be the platform default encoding. * @see #MinimalPerfectHash(Iterable, int) */ public MinimalPerfectHash( final String termFile, final String encoding ) { this( termFile, encoding, WEIGHT_UNKNOWN, false ); } /** Creates a new minimal perfect hash by copying a given one; non-transient fields are (shallow) copied. * * @param mph the perfect hash to be copied. */ protected MinimalPerfectHash( final MinimalPerfectHash mph ) { this.n = mph.n; this.m = mph.m; this.weightLength = mph.weightLength; this.weight0 = mph.weight0; this.weight1 = mph.weight1; this.weight2 = mph.weight2; this.init = mph.init; this.g = mph.g; this.rightShift = mph.rightShift; this.n4 = mph.n4; this.t = mph.t; } /** The internal state of a visit. */ private class Visit { /** {@link #m} plus 2. */ final int mPlusOverhead = m + NODE_OVERHEAD; /** An 3×n array recording the triple of vertices involved in each hyperedge. It is *reversed* w.r.t. what you would expect to reduce object creation. */ final int[][] edge = new int[ 3 ][ n ]; /** Whether a hyperedge has been already removed. */ final boolean[] removed = new boolean[ n ]; /** For each vertex of the intermediate hypergraph, the vector of incident hyperedges. */ final int[] inc = new int[ mPlusOverhead * 3 ]; /** The next position to fill in the respective incidence vector. * Used also to store the hyperedge permutation and speed up permute(). 
*/
		final int[] last = new int[ mPlusOverhead ];
		/** For each vertex of the intermediate hypergraph, the offset into
		 * the vector of incident hyperedges. Used also to speed up permute(). */
		final int[] incOffset = new int[ mPlusOverhead ];
		/** The hyperedge stack. Used also to invert the hyperedge permutation. */
		final int[] stack = new int[ n ];
		/** The degree of each vertex of the intermediate hypergraph. */
		final int[] d = new int[ mPlusOverhead ];
		/** Initial top of the hyperedge stack. */
		int top;

		public Visit( final Iterable<? extends CharSequence> terms ) {
// --- Code-viewer page chrome (keyboard-shortcut help), not part of the source file: ---
// Copy code: Ctrl+C · Search code: Ctrl+F · Fullscreen: F11 · Toggle theme: Ctrl+Shift+D
// Show shortcuts: ? · Increase font size: Ctrl+= · Decrease font size: Ctrl+-