⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 producednffromlines.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.test;

import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** Reads a sequence of documents represented as blank-separated
 * sequences of words, where documents are separated by new-lines.
 * Produces and prints <var>q</var> DNF queries (OR's of AND's)
 * as follows: for every query, <var>k</var> documents are selected
 * at random, and from each of them <var>h</var> words at most are
 * selected. The query is a <var>k</var>-ary OR of the corresponding
 * AND's.
 *
 * <p>The documents are read from standard input (UTF-8), one per line. The DNF
 * queries are written to standard output, one per line. For each query, a second
 * formula is written to standard error: an AND over word positions of the OR of
 * the words selected at that position in each document of the query.
 */
final public class ProduceDNFFromLines {
	private final static Logger LOGGER = Util.getLogger( ProduceDNFFromLines.class );

	private ProduceDNFFromLines() {}

	/** Parses the command line, samples documents and words, and prints the queries.
	 *
	 * @param arg the command-line arguments: the number of documents, plus the optional
	 * <samp>-q</samp> (queries), <samp>-d</samp> (documents per query) and <samp>-w</samp>
	 * (maximum words per document) flags.
	 * @throws IOException if reading standard input fails.
	 * @throws JSAPException if the command-line parser cannot be set up.
	 */
	public static void main( final String[] arg ) throws IOException, JSAPException {

		SimpleJSAP jsap = new SimpleJSAP( ProduceDNFFromLines.class.getName(), "Produces random DNF queries (ORs of ANDs) from a collection of documents read from standard input, one document per line.",
			new Parameter[] {
				new UnflaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.REQUIRED, "The number of documents." ),
				new FlaggedOption( "queries", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, 'q', "queries", "The number of queries to be produced." ),
				new FlaggedOption( "docperquery", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'd', "docperquery", "The number of documents per query." ),
				new FlaggedOption( "wordsperdoc", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'w', "words", "The (maximum) number of words per document." )
			} );

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final int numberOfDocuments = jsapResult.getInt( "numberOfDocuments" );
		final int queries = jsapResult.getInt( "queries" );
		final int docperquery = jsapResult.getInt( "docperquery" );
		final int wordsperdoc = jsapResult.getInt( "wordsperdoc" );

		// Negative sizes would otherwise surface as a NegativeArraySizeException below.
		if ( numberOfDocuments < 0 || queries < 0 || docperquery < 0 || wordsperdoc < 0 ) {
			System.err.println( "The number of documents, queries, documents per query and words per document must be nonnegative" );
			System.exit( 1 );
		}

		if ( docperquery > numberOfDocuments ) {
			System.err.println( "There are not enough documents for the number of documents/query required" );
			System.exit( 1 );
		}

		int i, j, q, t;
		// A permutation of the document indices, partially reshuffled for every query.
		final int docs[] = new int[ numberOfDocuments ];
		// For each query, the sorted indices of the documents selected for it.
		final int docForQuery[][] = new int[ queries ][ docperquery ];
		// query[ q ][ d ][ w ] is the w-th word selected from the d-th document of query q (null if missing).
		final String query[][][] = new String[ queries ][ docperquery ][ wordsperdoc ];
		// For each query, how many of its documents have been read so far.
		final int coveredForQuery[] = new int[ queries ];
		int maxDoc = 0;
		// used[ x ] is true if document x has been selected by at least one query.
		final boolean[] used = new boolean[ numberOfDocuments ];

		for ( i = 0; i < numberOfDocuments; i++ ) docs[ i ] = i;

		for ( q = 0; q < queries; q++ ) {
			// Partial shuffle: the first docperquery entries become a random sample without replacement.
			for ( i = 0; i < docperquery; i++ ) {
				j = i + (int)( ( numberOfDocuments - i ) * Math.random() );
				t = docs[ i ]; docs[ i ] = docs[ j ]; docs[ j ] = t;
				docForQuery[ q ][ i ] = docs[ i ];
				used[ docs[ i ] ] = true;
				if ( docs[ i ] > maxDoc ) maxDoc = docs[ i ];
			}
			// Sorted so that documents can be matched in the order in which lines are read.
			Arrays.sort( docForQuery[ q ] );
		}

		String split[];
		int words[] = new int[ 1024 ];
		final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) );

		int lineNumber = 0;
		// The number of queries that still miss at least one document.
		int numberOfPartialQueries = queries;
		ProgressLogger pl = new ProgressLogger( LOGGER );
		pl.itemsName = "Klines";
		pl.expectedUpdates = maxDoc / 1000;
		pl.start( "Generating queries..." );
		MutableString line = new MutableString();

		// No selected document has index numberOfDocuments or larger, so reading stops there;
		// this also keeps used[ lineNumber ] in bounds when the input has extra lines.
		while( numberOfPartialQueries > 0 && lineNumber < numberOfDocuments && reader.readLine( line ) != null ) {
			if ( used[ lineNumber ] ) {
				for ( q = 0; q < queries; q++ )
					if ( coveredForQuery[ q ] < docperquery && docForQuery[ q ][ coveredForQuery[ q ] ] == lineNumber ) {
						// NOTE(review): splitting on a single blank yields empty tokens for leading or
						// consecutive blanks, and such tokens can be selected as words.
						split = line.toString().split( " " );
						final int nw = split.length;
						words = IntArrays.ensureCapacity( words, nw + 1 );
						for ( i = 0; i < nw; i++ ) words[ i ] = i;
						// Partial shuffle of word positions: picks min( wordsperdoc, nw ) distinct positions.
						final int selected = Math.min( wordsperdoc, nw );
						for ( i = 0; i < selected; i++ ) {
							j = i + (int)( ( nw - i ) * Math.random() );
							t = words[ i ]; words[ i ] = words[ j ]; words[ j ] = t;
							query[ q ][ coveredForQuery[ q ] ][ i ] = split[ words[ i ] ];
						}
						coveredForQuery[ q ]++;
						if ( coveredForQuery[ q ] == docperquery ) numberOfPartialQueries--;
					}
			}
			lineNumber++;
			if ( lineNumber % 1000 == 0 ) pl.update();
		}

		pl.done();

		// Scratch clauses: the DNF needs one per document, the second formula one per word position.
		final MutableString p[] = new MutableString[ Math.max( docperquery, wordsperdoc ) ];
		final MutableString s = new MutableString();
		for( i = 0; i < p.length; i++ ) p[ i ] = new MutableString();

		// DNF on standard output: one AND-clause per selected document, joined by OR.
		for ( q = 0; q < queries; q++ ) {
			for( int d = 0; d < docperquery; d++ ) {
				// Words are stored contiguously from position 0; the first null marks the end.
				int last = 0;
				while( last < wordsperdoc && query[ q ][ d ][ last ] != null ) last++;
				p[ d ].replace( '(' ).append( query[ q ][ d ], 0, last, " AND " ).append( ')' );
			}
			System.out.println( s.length( 0 ).append( p, 0, docperquery, " OR " ) );
		}

		// Second formula on standard error: for each word position, the OR of the words at that
		// position across the documents of the query; the resulting clauses are joined by AND.
		final ArrayList<String> l = new ArrayList<String>();
		final String[] emptyArray = new String[ 0 ];
		for ( q = 0; q < queries; q++ ) {
			for( int w = 0; w < wordsperdoc; w++ ) {
				l.clear();
				for( int d = 0; d < docperquery; d++ ) if ( query[ q ][ d ][ w ] != null ) l.add( query[ q ][ d ][ w ] );
				p[ w ].replace( '(' ).append( l.toArray( emptyArray ), " OR " ).append( ')' );
			}
			System.err.println( s.length( 0 ).append( p, 0, wordsperdoc, " AND " ) );
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -