📄 producednffromlines.java
字号:
package it.unimi.dsi.mg4j.test;

import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** Reads a sequence of documents represented as blank-separated
 * sequences of words, where documents are separated by new-lines.
 * Produces and prints <var>q</var> DNF queries (OR's of AND's)
 * as follows: for every query, <var>k</var> documents are selected
 * at random, and from each of them <var>h</var> words at most are
 * selected. The query is a <var>k</var>-ary OR of the corresponding
 * AND's.
 *
 * <p>Here <var>q</var>, <var>k</var> and <var>h</var> are the values of the
 * <code>queries</code>, <code>docperquery</code> and <code>wordsperdoc</code>
 * options, respectively. Documents are read from standard input (UTF-8), one per line.
 * The DNF queries are written to standard output. For each query, a second formula is
 * written to standard error: an AND, over the word positions, of the OR of the words
 * chosen at that position in each selected document.
 */
final public class ProduceDNFFromLines {
    private final static Logger LOGGER = Util.getLogger( ProduceDNFFromLines.class );

    /** Not instantiable: this class only provides {@link #main(String[])}. */
    private ProduceDNFFromLines() {}

    /** Parses the command line, samples the documents for each query, scans standard input
     * to pick the words, and prints the resulting queries.
     *
     * @param arg the command-line arguments: the number of documents, plus the optional
     * <code>-q</code>, <code>-d</code> and <code>-w</code> flags.
     * @throws IOException if reading standard input fails.
     * @throws JSAPException if the command-line parser cannot be set up.
     */
    public static void main( final String[] arg ) throws IOException, JSAPException {
        // NOTE(review): the description string below looks copy-pasted from another tool;
        // it is left untouched because it is user-visible output.
        SimpleJSAP jsap = new SimpleJSAP( ProduceDNFFromLines.class.getName(), "Prints or selects parts of a stat file using global counts.",
            new Parameter[] {
                new UnflaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.REQUIRED, "The number of documents." ),
                new FlaggedOption( "queries", JSAP.INTEGER_PARSER, "1", JSAP.NOT_REQUIRED, 'q', "queries", "The number of queries to be produced." ),
                new FlaggedOption( "docperquery", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'd', "docperquery", "The number of documents per query." ),
                new FlaggedOption( "wordsperdoc", JSAP.INTEGER_PARSER, "2", JSAP.NOT_REQUIRED, 'w', "words", "The (maximum) number of words per document." ),
            });

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final int numberOfDocuments = jsapResult.getInt( "numberOfDocuments" );
        final int queries = jsapResult.getInt( "queries" );
        final int docperquery = jsapResult.getInt( "docperquery" );
        final int wordsperdoc = jsapResult.getInt( "wordsperdoc" );

        if ( docperquery > numberOfDocuments ) {
            System.err.println( "There are not enough documents for the number of documents/query required" );
            System.exit( 1 );
        }

        int i, j, q, t;
        final int docs[] = new int[ numberOfDocuments ];
        // docForQuery[ q ] holds the (sorted) indices of the documents chosen for query q.
        final int docForQuery[][] = new int[ queries ][ docperquery ];
        // query[ q ][ d ][ w ] is the w-th word picked from the d-th document of query q (null if absent).
        final String query[][][] = new String[ queries ][ docperquery ][ wordsperdoc ];
        // coveredForQuery[ q ] counts how many documents of query q have already been read.
        final int coveredForQuery[] = new int [ queries ];
        int maxDoc = 0;
        // used[ x ] is true if document x belongs to at least one query.
        final boolean[] used = new boolean[ numberOfDocuments ];

        for ( i = 0; i < numberOfDocuments; i++ ) docs[ i ] = i;

        for ( q = 0; q < queries; q++ ) {
            // Partial shuffle: position i is swapped with a random position in [i, numberOfDocuments),
            // so the first docperquery entries are distinct documents.
            for ( i = 0; i < docperquery; i++ ) {
                j = i + (int)( ( numberOfDocuments - i ) * Math.random() );
                t = docs[ i ]; docs[ i ] = docs[ j ]; docs[ j ] = t;
                docForQuery[ q ][ i ] = docs[ i ];
                used[ docs[ i ] ] = true;
                if ( docs[ i ] > maxDoc ) maxDoc = docs[ i ];
            }
            // Sorted so that documents can be matched in order while scanning the input once.
            Arrays.sort( docForQuery[ q ] );
        }

        String split[];
        int words[] = new int[ 1024 ];
        final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ) );
        int lineNumber = 0;
        // Number of queries that still miss at least one document.
        int numberOfPartialQueries = queries;

        ProgressLogger pl = new ProgressLogger( LOGGER );
        pl.itemsName = "Klines";
        pl.expectedUpdates = maxDoc / 1000;
        pl.start( "Generating queries..." );

        MutableString line = new MutableString();
        // The lineNumber bound protects used[] when the input has more lines than declared
        // and some query can never be completed (e.g., zero documents per query).
        while( reader.readLine( line ) != null && numberOfPartialQueries > 0 && lineNumber < numberOfDocuments ) {
            if ( used[ lineNumber ] ) {
                for ( q = 0; q < queries; q++ )
                    if ( coveredForQuery[ q ] < docperquery && docForQuery[ q ][ coveredForQuery[ q ] ] == lineNumber ) {
                        split = line.toString().split( " " );
                        int nw = split.length;
                        words = IntArrays.ensureCapacity( words, nw + 1 );
                        for ( i = 0; i < nw; i++ ) words[ i ] = i;
                        // Same partial shuffle as above, this time on word positions.
                        for ( i = 0; i < Math.min( wordsperdoc, nw ); i++ ) {
                            j = i + (int)( ( nw - i ) * Math.random() );
                            t = words[ i ]; words[ i ] = words[ j ]; words[ j ] = t;
                            query[ q ][ coveredForQuery[ q ] ][ i ] = split[ words[ i ] ];
                        }
                        coveredForQuery[ q ]++;
                        if ( coveredForQuery[ q ] == docperquery ) numberOfPartialQueries--;
                    }
            }
            lineNumber++;
            if ( lineNumber % 1000 == 0 ) pl.update();
        }

        pl.done();

        // One scratch buffer per clause: the DNF output needs docperquery of them,
        // the second output needs wordsperdoc of them.
        final MutableString p[] = new MutableString[ Math.max( docperquery, wordsperdoc ) ], s = new MutableString();
        for( i = 0; i < p.length; i++ ) p[ i ] = new MutableString();

        // Standard output: for each query, the OR over its documents of the AND of the chosen words.
        for ( q = 0; q < queries; q++ ) {
            for( int d = 0; d < docperquery; d++ ) {
                // Count the words actually picked (documents may have fewer than wordsperdoc words).
                int last = 0;
                while( last < wordsperdoc && query[ q ][ d ][ last ] != null ) last++;
                p[ d ].replace( '(' ).append( query[ q ][ d ], 0, last, " AND " ).append( ')' );
            }
            System.out.println( s.length( 0 ).append( p, 0, docperquery, " OR " ) );
        }

        // Standard error: for each query, the AND over word positions of the OR,
        // over its documents, of the word chosen at that position.
        ArrayList<String> l = new ArrayList<String>();
        final String[] emptyArray = new String[ 0 ];
        for ( q = 0; q < queries; q++ ) {
            for( int w = 0; w < wordsperdoc; w++ ) {
                l.clear();
                for( int d = 0; d < docperquery; d++ ) if ( query[ q ][ d ][ w ] != null ) l.add( query[ q ][ d ][ w ] );
                p[ w ].replace( '(' ).append( l.toArray( emptyArray ), " OR " ).append( ')' );
            }
            System.err.println( s.length( 0 ).append( p, 0, wordsperdoc, " AND " ) );
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -