📄 minimalperfecthash.java
字号:
/* NOTE(review): this excerpt begins inside an enclosing method whose signature is
 * above this chunk. The code below (re)generates a random 3-hypergraph over the
 * terms until it is acyclic, peels it, and assigns the values in g[] that make the
 * hash minimal and perfect. The original paste had collapsed all newlines, so the
 * leading // comments swallowed the code that followed them; the line structure is
 * restored here with all tokens unchanged. */

// We cache all variables for faster access
final int[][] edge = this.edge;
final int[] last = this.last;
final int[] inc = this.inc;
final int[] incOffset = this.incOffset;
final int[] stack = this.stack;
final int[] d = this.d;
final MersenneTwister r = new MersenneTwister( new Date() );
int i, j, k, s, v = -1;
final int[] h = new int[ 3 ];

do {
	LOGGER.info( "Generating random hypergraph..." );
	top = 0;

	// Fresh random seeds for the three intermediate hash functions on every attempt.
	init[ 0 ] = r.nextInt();
	init[ 1 ] = r.nextInt();
	init[ 2 ] = r.nextInt();

	/* We choose the random weights for the three intermediate hash functions. */
	for ( i = 0; i < weightLength; i++ ) {
		weight0[ i ] = r.nextInt();
		weight1[ i ] = r.nextInt();
		weight2[ i ] = r.nextInt();
	}

	/* We build the hyperedge list, checking that we do not create a degenerate
	 * hyperedge (one with two coinciding vertices). */
	i = 0;
	CharSequence cs = null;
	IntArrays.fill( d, 0 );
	for( Iterator<? extends CharSequence> w = terms.iterator(); w.hasNext(); ) {
		hash( cs = w.next(), h );
		if ( h[ 0 ] == h[ 1 ] || h[ 1 ] == h[ 2 ] || h[ 2 ] == h[ 0 ] ) break;
		edge[ 0 ][ i ] = h[ 0 ];
		edge[ 1 ][ i ] = h[ 1 ];
		edge[ 2 ][ i ] = h[ 2 ];
		i++;
	}

	if ( i < n ) {
		// A degenerate hyperedge was found: retry with fresh random weights.
		LOGGER.info( "Hypergraph generation interrupted by degenerate hyperedge " + i + " (string: \"" + cs + "\")." );
		continue;
	}

	/* We compute the degree of each vertex. */
	for( j = 0; j < 3; j++ ) {
		i = n;
		while( i-- != 0 ) d[ edge[ j ][ i ] ]++;
	}

	LOGGER.info( "Checking for duplicate hyperedges..." );

	/* Now we quicksort hyperedges lexicographically, keeping into last their permutation.
	 * Vertex indices are nonnegative array indices, so the subtraction-based comparator
	 * below cannot overflow. */
	i = n;
	while( i-- != 0 ) last[ i ] = i;
	GenericSorting.quickSort( 0, n, new IntComparator() {
		public int compare( final int x, final int y ) {
			int r;
			if ( ( r = edge[ 0 ][ x ] - edge[ 0 ][ y ] ) != 0 ) return r;
			if ( ( r = edge[ 1 ][ x ] - edge[ 1 ][ y ] ) != 0 ) return r;
			return edge[ 2 ][ x ] - edge[ 2 ][ y ];
		}
	}, new Swapper() {
		public void swap( final int x, final int y ) {
			int e0 = edge[ 0 ][ x ], e1 = edge[ 1 ][ x ], e2 = edge[ 2 ][ x ], p = last[ x ];
			edge[ 0 ][ x ] = edge[ 0 ][ y ]; edge[ 1 ][ x ] = edge[ 1 ][ y ]; edge[ 2 ][ x ] = edge[ 2 ][ y ];
			edge[ 0 ][ y ] = e0; edge[ 1 ][ y ] = e1; edge[ 2 ][ y ] = e2;
			last[ x ] = last[ y ]; last[ y ] = p;
		}
	} );

	// After sorting, a duplicate hyperedge shows up as two adjacent equal triples.
	i = n - 1;
	while( i-- != 0 ) if ( edge[ 0 ][ i + 1 ] == edge[ 0 ][ i ] && edge[ 1 ][ i + 1 ] == edge[ 1 ][ i ] && edge[ 2 ][ i + 1 ] == edge[ 2 ][ i ]) break;
	if ( i != -1 ) {
		LOGGER.info( "Found double hyperedge for terms " + last[ i + 1 ] + " and " + last[ i ] + "." );
		continue;
	}

	/* We now invert last and permute all hyperedges back into their place. Note that
	 * we use last and incOffset to speed up the process. */
	i = n;
	while( i-- != 0 ) stack[ last[ i ] ] = i;
	GenericPermuting.permute( stack, new Swapper() {
		public void swap( final int x, final int y ) {
			int e0 = edge[ 0 ][ x ], e1 = edge[ 1 ][ x ], e2 = edge[ 2 ][ x ];
			edge[ 0 ][ x ] = edge[ 0 ][ y ]; edge[ 1 ][ x ] = edge[ 1 ][ y ]; edge[ 2 ][ x ] = edge[ 2 ][ y ];
			edge[ 0 ][ y ] = e0; edge[ 1 ][ y ] = e1; edge[ 2 ][ y ] = e2;
		}
	}, last, incOffset );

	LOGGER.info( "Visiting hypergraph..." );

	/* We set up the offset of each vertex in the incidence vector. This is necessary
	 * to avoid creating m incidence vectors at each round. */
	IntArrays.fill( last, 0 );
	incOffset[ 0 ] = 0;
	for( i = 1; i < mPlusOverhead; i++ ) incOffset[ i ] = incOffset[ i - 1 ] + d[ i - 1 ];

	/* We fill the vector. */
	for( i = 0; i < n; i++ )
		for( j = 0; j < 3; j++ ) {
			v = edge[ j ][ i ];
			inc[ incOffset[ v ] + last[ v ]++ ] = i;
		}

	/* We visit the hypergraph, peeling from every vertex of degree one. */
	BooleanArrays.fill( removed, false );
	for( i = 0; i < mPlusOverhead; i++ ) if ( d[ i ] == 1 ) visit( i );

	if ( top == n ) LOGGER.info( "Visit completed." );
	else LOGGER.info( "Visit failed: stripped " + top + " hyperedges out of " + n + "." );
} while ( top != n );

LOGGER.info( "Assigning values..." );

/* We assign values in reverse peeling order: for each stripped hyperedge k, the
 * last still-unassigned vertex v gets the value that makes the three g[] entries
 * sum to k modulo n. */
IntArrays.fill( g, -1 );
while( top > 0 ) {
	k = stack[ --top ];
	s = 0;
	for ( j = 0; j < 3; j++ ) {
		if ( g[ edge[ j ][ k ] ] == -1 ) { g[ edge[ j ][ k ] ] = 0; v = edge[ j ][ k ]; }
		else s += g[ edge[ j ][ k ] ];
	}
	g[v] = ( ( k - s ) % n + n ) % n;
}
LOGGER.info( "Completed." );
}

/* This is the original recursive visit. It is here for documentation purposes. It
cannot handle more than about 1,000,000 terms.

private void visit( int x ) {
	int i, j, k = -1;
	for( i = 0; i < last[x]; i++ ) if ( !removed[ k = ((int[])inc[x])[i] ] ) break; // The only hyperedge incident on x in the current configuration.
	stack[top++] = k;
	// We update the degrees and the incidence lists.
	removed[k] = true;
	for( i = 0; i < 3; i++ ) d[edge[i][k]]--;
	// We follow recursively the other vertices of the hyperedge, if they have degree one in the current configuration.
	for( i = 0; i < 3; i++ ) if ( edge[i][k] != x && d[edge[i][k]] == 1 ) visit( edge[i][k] );
}
*/

final int[] recStackI = new int[ n ]; // The stack for i.
final int[] recStackK = new int[ n ]; // The stack for k.

/** Iterative version of the visit above: strips hyperedges in peeling order,
 * pushing them onto {@code stack}, using an explicit recursion stack. */
private void visit( int x ) {
	// We cache all variables for faster access
	final int[] recStackI = this.recStackI;
	final int[] recStackK = this.recStackK;
	final int[][] edge = this.edge;
	final int[] last = this.last;
	final int[] inc = this.inc;
	final int[] incOffset = this.incOffset;
	final int[] stack = this.stack;
	final int[] d = this.d;
	final boolean[] removed = this.removed;
	int i, k = -1;
	boolean inside;

	// Stack initialization
	int recTop = 0; // Initial top of the recursion stack.
	inside = false;

	while ( true ) {
		if ( !
inside ) { for ( i = 0; i < last[ x ]; i++ ) if ( !removed[ k = inc[ incOffset[ x ] + i ] ] ) break; // The only hyperedge incident on x in the current configuration. // TODO: k could be wrong if the graph is regular and cyclic. stack[ top++ ] = k; /* We update the degrees and the incidence lists. */ removed[ k ] = true; for ( i = 0; i < 3; i++ ) d[ edge[ i ][ k ] ]--; } /* We follow recursively the other vertices of the hyperedge, if they have degree one in the current configuration. */ for( i = 0; i < 3; i++ ) if ( edge[ i ][ k ] != x && d[ edge[ i ][ k ] ] == 1 ) { recStackI[ recTop ] = i + 1; recStackK[ recTop ] = k; recTop++; x = edge[ i ][ k ]; inside = false; break; } if ( i < 3 ) continue; if ( --recTop < 0 ) return; i = recStackI[ recTop ]; k = recStackK[ recTop ]; inside = true; } } } private void writeObject( final ObjectOutputStream s ) throws IOException { s.defaultWriteObject(); if ( n < TERM_THRESHOLD ) s.writeObject( t ); } private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); n4 = n * 4; if ( n < TERM_THRESHOLD ) t = (CharSequence[])s.readObject(); } /** A main method for minimal perfect hash construction that accepts a default class. * * @param klass the default class to be built. * @param arg the usual argument array. */ @SuppressWarnings("unused") protected static void main( final Class<? extends MinimalPerfectHash> klass, final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP( klass.getName(), "Builds a minimal perfect hash table reading a newline-separated list of terms.", new Parameter[] { new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read terms." 
), new FlaggedOption( "class", MG4JClassParser.getParser(), klass.getName(), JSAP.NOT_REQUIRED, 'c', "class", "A subclass of MinimalPerfectHash to be used when creating the table." ), new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ), new Switch( "zipped", 'z', "zipped", "The term list is compressed in gzip format." ), new Switch( "sorted", 's', "sorted", "The term list is sorted--optimise weight length." ), new Switch( "check", 'C', "check", "Check an existing map rather than build a new one." ), new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ), new UnflaggedOption( "table", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised minimal perfect hash table." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; final int bufferSize = jsapResult.getInt( "bufferSize" ); final String tableName = jsapResult.getString( "table" ); final String termFile = jsapResult.getString( "termFile" ); final Class<?> tableClass = jsapResult.getClass( "class" ); final Charset encoding = (Charset)jsapResult.getObject( "encoding" ); final boolean sorted = jsapResult.getBoolean( "sorted" ); final boolean zipped = jsapResult.getBoolean( "zipped" ); final boolean check = jsapResult.getBoolean( "check" ); final MinimalPerfectHash minimalPerfectHash; if ( check ) { TermMap mph = (TermMap)BinIO.loadObject( tableName ); final it.unimi.dsi.logging.ProgressLogger pl = new it.unimi.dsi.logging.ProgressLogger( LOGGER ); pl.itemsName = "terms"; final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( termFile != null ? 
new FileInputStream( termFile ) : System.in, encoding ), bufferSize ), pl ); final int size = mph.size(); pl.start( "Reading terms..." ); for( int i = 0; i < size; i++ ) if ( mph.getNumber( termIterator.next() ) != i ) System.out.println( i ); pl.done(); if ( termIterator.hasNext() ) System.out.println( "More terms than elements in the map" ); } else { if ( termFile == null ) { ArrayList<MutableString> termList = new ArrayList<MutableString>(); final it.unimi.dsi.logging.ProgressLogger pl = new it.unimi.dsi.logging.ProgressLogger( LOGGER ); pl.itemsName = "terms"; final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, encoding ), bufferSize ), pl ); pl.start( "Reading terms..." ); while( termIterator.hasNext() ) termList.add( termIterator.next().copy() ); pl.done(); LOGGER.info( "Building minimal perfect hash table..." ); minimalPerfectHash = (MinimalPerfectHash)tableClass.getConstructor( Iterable.class, int.class ).newInstance( termList, Integer.valueOf( sorted ? WEIGHT_UNKNOWN_SORTED_TERMS : WEIGHT_UNKNOWN ) ); } else { LOGGER.info( "Building minimal perfect hash table..." ); minimalPerfectHash = (MinimalPerfectHash)tableClass.getConstructor( String.class, String.class, int.class, boolean.class ).newInstance( termFile, encoding.toString(), Integer.valueOf( sorted ? WEIGHT_UNKNOWN_SORTED_TERMS : WEIGHT_UNKNOWN ), Boolean.valueOf( zipped ) ); } LOGGER.info( "Writing to file..." ); BinIO.storeObject( minimalPerfectHash, tableName ); LOGGER.info( "Completed." ); } } public static void main( final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { main( MinimalPerfectHash.class, arg ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -