📄 minimalperfecthash.java
字号:
/* NOTE(review): this excerpt begins inside an enclosing method whose signature is
 * above this chunk. The code below (re)generates a random 3-hypergraph over the
 * terms until it is acyclic, peels it, and assigns the values in g[] that make the
 * hash minimal and perfect. The original paste had collapsed all newlines, so the
 * leading // comments swallowed the code that followed them; the line structure is
 * restored here with all tokens unchanged. */

// We cache all variables for faster access
final int[][] edge = this.edge;
final int[] last = this.last;
final int[] inc = this.inc;
final int[] incOffset = this.incOffset;
final int[] stack = this.stack;
final int[] d = this.d;
final MersenneTwister r = new MersenneTwister( new Date() );
int i, j, k, s, v = -1;
final int[] h = new int[ 3 ];

do {
	LOGGER.info( "Generating random hypergraph..." );
	top = 0;

	// Fresh random seeds for the three intermediate hash functions on every attempt.
	init[ 0 ] = r.nextInt();
	init[ 1 ] = r.nextInt();
	init[ 2 ] = r.nextInt();

	/* We choose the random weights for the three intermediate hash functions. */
	for ( i = 0; i < weightLength; i++ ) {
		weight0[ i ] = r.nextInt();
		weight1[ i ] = r.nextInt();
		weight2[ i ] = r.nextInt();
	}

	/* We build the hyperedge list, checking that we do not create a degenerate
	 * hyperedge (one with two coinciding vertices). */
	i = 0;
	CharSequence cs = null;
	IntArrays.fill( d, 0 );
	for( Iterator<? extends CharSequence> w = terms.iterator(); w.hasNext(); ) {
		hash( cs = w.next(), h );
		if ( h[ 0 ] == h[ 1 ] || h[ 1 ] == h[ 2 ] || h[ 2 ] == h[ 0 ] ) break;
		edge[ 0 ][ i ] = h[ 0 ];
		edge[ 1 ][ i ] = h[ 1 ];
		edge[ 2 ][ i ] = h[ 2 ];
		i++;
	}

	if ( i < n ) {
		// A degenerate hyperedge was found: retry with fresh random weights.
		LOGGER.info( "Hypergraph generation interrupted by degenerate hyperedge " + i + " (string: \"" + cs + "\")." );
		continue;
	}

	/* We compute the degree of each vertex. */
	for( j = 0; j < 3; j++ ) {
		i = n;
		while( i-- != 0 ) d[ edge[ j ][ i ] ]++;
	}

	LOGGER.info( "Checking for duplicate hyperedges..." );

	/* Now we quicksort hyperedges lexicographically, keeping into last their permutation.
	 * Vertex indices are nonnegative array indices, so the subtraction-based comparator
	 * below cannot overflow. */
	i = n;
	while( i-- != 0 ) last[ i ] = i;
	GenericSorting.quickSort( 0, n, new IntComparator() {
		public int compare( final int x, final int y ) {
			int r;
			if ( ( r = edge[ 0 ][ x ] - edge[ 0 ][ y ] ) != 0 ) return r;
			if ( ( r = edge[ 1 ][ x ] - edge[ 1 ][ y ] ) != 0 ) return r;
			return edge[ 2 ][ x ] - edge[ 2 ][ y ];
		}
	}, new Swapper() {
		public void swap( final int x, final int y ) {
			int e0 = edge[ 0 ][ x ], e1 = edge[ 1 ][ x ], e2 = edge[ 2 ][ x ], p = last[ x ];
			edge[ 0 ][ x ] = edge[ 0 ][ y ]; edge[ 1 ][ x ] = edge[ 1 ][ y ]; edge[ 2 ][ x ] = edge[ 2 ][ y ];
			edge[ 0 ][ y ] = e0; edge[ 1 ][ y ] = e1; edge[ 2 ][ y ] = e2;
			last[ x ] = last[ y ]; last[ y ] = p;
		}
	} );

	// After sorting, a duplicate hyperedge shows up as two adjacent equal triples.
	i = n - 1;
	while( i-- != 0 ) if ( edge[ 0 ][ i + 1 ] == edge[ 0 ][ i ] && edge[ 1 ][ i + 1 ] == edge[ 1 ][ i ] && edge[ 2 ][ i + 1 ] == edge[ 2 ][ i ]) break;
	if ( i != -1 ) {
		LOGGER.info( "Found double hyperedge for terms " + last[ i + 1 ] + " and " + last[ i ] + "." );
		continue;
	}

	/* We now invert last and permute all hyperedges back into their place. Note that
	 * we use last and incOffset to speed up the process. */
	i = n;
	while( i-- != 0 ) stack[ last[ i ] ] = i;
	GenericPermuting.permute( stack, new Swapper() {
		public void swap( final int x, final int y ) {
			int e0 = edge[ 0 ][ x ], e1 = edge[ 1 ][ x ], e2 = edge[ 2 ][ x ];
			edge[ 0 ][ x ] = edge[ 0 ][ y ]; edge[ 1 ][ x ] = edge[ 1 ][ y ]; edge[ 2 ][ x ] = edge[ 2 ][ y ];
			edge[ 0 ][ y ] = e0; edge[ 1 ][ y ] = e1; edge[ 2 ][ y ] = e2;
		}
	}, last, incOffset );

	LOGGER.info( "Visiting hypergraph..." );

	/* We set up the offset of each vertex in the incidence vector. This is necessary
	 * to avoid creating m incidence vectors at each round. */
	IntArrays.fill( last, 0 );
	incOffset[ 0 ] = 0;
	for( i = 1; i < mPlusOverhead; i++ ) incOffset[ i ] = incOffset[ i - 1 ] + d[ i - 1 ];

	/* We fill the vector. */
	for( i = 0; i < n; i++ )
		for( j = 0; j < 3; j++ ) {
			v = edge[ j ][ i ];
			inc[ incOffset[ v ] + last[ v ]++ ] = i;
		}

	/* We visit the hypergraph, peeling from every vertex of degree one. */
	BooleanArrays.fill( removed, false );
	for( i = 0; i < mPlusOverhead; i++ ) if ( d[ i ] == 1 ) visit( i );

	if ( top == n ) LOGGER.info( "Visit completed." );
	else LOGGER.info( "Visit failed: stripped " + top + " hyperedges out of " + n + "." );
} while ( top != n );

LOGGER.info( "Assigning values..." );

/* We assign values in reverse peeling order: for each stripped hyperedge k, the
 * last still-unassigned vertex v gets the value that makes the three g[] entries
 * sum to k modulo n. */
IntArrays.fill( g, -1 );
while( top > 0 ) {
	k = stack[ --top ];
	s = 0;
	for ( j = 0; j < 3; j++ ) {
		if ( g[ edge[ j ][ k ] ] == -1 ) { g[ edge[ j ][ k ] ] = 0; v = edge[ j ][ k ]; }
		else s += g[ edge[ j ][ k ] ];
	}
	g[v] = ( ( k - s ) % n + n ) % n;
}
LOGGER.info( "Completed." );
}

/* This is the original recursive visit. It is here for documentation purposes. It
cannot handle more than about 1,000,000 terms.

private void visit( int x ) {
	int i, j, k = -1;
	for( i = 0; i < last[x]; i++ ) if ( !removed[ k = ((int[])inc[x])[i] ] ) break; // The only hyperedge incident on x in the current configuration.
	stack[top++] = k;
	// We update the degrees and the incidence lists.
	removed[k] = true;
	for( i = 0; i < 3; i++ ) d[edge[i][k]]--;
	// We follow recursively the other vertices of the hyperedge, if they have degree one in the current configuration.
	for( i = 0; i < 3; i++ ) if ( edge[i][k] != x && d[edge[i][k]] == 1 ) visit( edge[i][k] );
}
*/

final int[] recStackI = new int[ n ]; // The stack for i.
final int[] recStackK = new int[ n ]; // The stack for k.

/** Iterative version of the visit above: strips hyperedges in peeling order,
 * pushing them onto {@code stack}, using an explicit recursion stack. */
private void visit( int x ) {
	// We cache all variables for faster access
	final int[] recStackI = this.recStackI;
	final int[] recStackK = this.recStackK;
	final int[][] edge = this.edge;
	final int[] last = this.last;
	final int[] inc = this.inc;
	final int[] incOffset = this.incOffset;
	final int[] stack = this.stack;
	final int[] d = this.d;
	final boolean[] removed = this.removed;
	int i, k = -1;
	boolean inside;

	// Stack initialization
	int recTop = 0; // Initial top of the recursion stack.
	inside = false;

	while ( true ) {
		if ( !
inside ) { for ( i = 0; i < last[ x ]; i++ ) if ( !removed[ k = inc[ incOffset[ x ] + i ] ] ) break; // The only hyperedge incident on x in the current configuration. // TODO: k could be wrong if the graph is regular and cyclic. stack[ top++ ] = k; /* We update the degrees and the incidence lists. */ removed[ k ] = true; for ( i = 0; i < 3; i++ ) d[ edge[ i ][ k ] ]--; } /* We follow recursively the other vertices of the hyperedge, if they have degree one in the current configuration. */ for( i = 0; i < 3; i++ ) if ( edge[ i ][ k ] != x && d[ edge[ i ][ k ] ] == 1 ) { recStackI[ recTop ] = i + 1; recStackK[ recTop ] = k; recTop++; x = edge[ i ][ k ]; inside = false; break; } if ( i < 3 ) continue; if ( --recTop < 0 ) return; i = recStackI[ recTop ]; k = recStackK[ recTop ]; inside = true; } } } private void writeObject( final ObjectOutputStream s ) throws IOException { s.defaultWriteObject(); if ( n < TERM_THRESHOLD ) s.writeObject( t ); } private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); n4 = n * 4; if ( n < TERM_THRESHOLD ) t = (CharSequence[])s.readObject(); } /** A main method for minimal perfect hash construction that accepts a default class. * * @param klass the default class to be built. * @param arg the usual argument array. */ @SuppressWarnings("unused") protected static void main( final Class<? extends MinimalPerfectHash> klass, final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP( klass.getName(), "Builds a minimal perfect hash table reading a newline-separated list of terms.", new Parameter[] { new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read terms." 
), new FlaggedOption( "class", MG4JClassParser.getParser(), klass.getName(), JSAP.NOT_REQUIRED, 'c', "class", "A subclass of MinimalPerfectHash to be used when creating the table." ), new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term file encoding." ), new Switch( "zipped", 'z', "zipped", "The term list is compressed in gzip format." ), new Switch( "sorted", 's', "sorted", "The term list is sorted--optimise weight length." ), new Switch( "check", 'C', "check", "Check an existing map rather than build a new one." ), new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ), new UnflaggedOption( "table", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised minimal perfect hash table." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; final int bufferSize = jsapResult.getInt( "bufferSize" ); final String tableName = jsapResult.getString( "table" ); final String termFile = jsapResult.getString( "termFile" ); final Class<?> tableClass = jsapResult.getClass( "class" ); final Charset encoding = (Charset)jsapResult.getObject( "encoding" ); final boolean sorted = jsapResult.getBoolean( "sorted" ); final boolean zipped = jsapResult.getBoolean( "zipped" ); final boolean check = jsapResult.getBoolean( "check" ); final MinimalPerfectHash minimalPerfectHash; if ( check ) { TermMap mph = (TermMap)BinIO.loadObject( tableName ); final it.unimi.dsi.logging.ProgressLogger pl = new it.unimi.dsi.logging.ProgressLogger( LOGGER ); pl.itemsName = "terms"; final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( termFile != null ? 
new FileInputStream( termFile ) : System.in, encoding ), bufferSize ), pl ); final int size = mph.size(); pl.start( "Reading terms..." ); for( int i = 0; i < size; i++ ) if ( mph.getNumber( termIterator.next() ) != i ) System.out.println( i ); pl.done(); if ( termIterator.hasNext() ) System.out.println( "More terms than elements in the map" ); } else { if ( termFile == null ) { ArrayList<MutableString> termList = new ArrayList<MutableString>(); final it.unimi.dsi.logging.ProgressLogger pl = new it.unimi.dsi.logging.ProgressLogger( LOGGER ); pl.itemsName = "terms"; final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, encoding ), bufferSize ), pl ); pl.start( "Reading terms..." ); while( termIterator.hasNext() ) termList.add( termIterator.next().copy() ); pl.done(); LOGGER.info( "Building minimal perfect hash table..." ); minimalPerfectHash = (MinimalPerfectHash)tableClass.getConstructor( Iterable.class, int.class ).newInstance( termList, Integer.valueOf( sorted ? WEIGHT_UNKNOWN_SORTED_TERMS : WEIGHT_UNKNOWN ) ); } else { LOGGER.info( "Building minimal perfect hash table..." ); minimalPerfectHash = (MinimalPerfectHash)tableClass.getConstructor( String.class, String.class, int.class, boolean.class ).newInstance( termFile, encoding.toString(), Integer.valueOf( sorted ? WEIGHT_UNKNOWN_SORTED_TERMS : WEIGHT_UNKNOWN ), Boolean.valueOf( zipped ) ); } LOGGER.info( "Writing to file..." ); BinIO.storeObject( minimalPerfectHash, tableName ); LOGGER.info( "Completed." ); } } public static void main( final String[] arg ) throws InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, IOException, JSAPException, ClassNotFoundException { main( MinimalPerfectHash.class, arg ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -