📄 selectstats.java
字号:
package it.unimi.dsi.mg4j.test;

import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;

import java.io.FileReader;
import java.io.IOException;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** Prints or selects parts of a stat file using global counts.
 *
 * <P>The stat file is scanned line by line. A line starting with <code>#</code> is a
 * header: the first space-separated integer after the <code>"# "</code> prefix is used
 * as an index into the global counts read from the index, and the second one is compared
 * with the requested quantum bit length. Every other line is dumped or skipped
 * depending on the decision taken for the most recent header.
 */
final public class SelectStats {
    @SuppressWarnings("unused")
    private final static Logger LOGGER = Util.getLogger( SelectStats.class );

    private SelectStats() {}

    /** A reasonable format for real numbers. */
    private static final java.text.NumberFormat formatDouble = new java.text.DecimalFormat( "#,##0.00000" );

    /** Formats a number.
     *
     * <P>This method formats a double separating thousands and printing five fractional digits.
     *
     * @param d a number.
     * @return a string containing a pretty print of the number.
     */
    public static String format( final double d ) {
        return formatDouble.format( d );
    }

    /** Reads the global counts of an index.
     *
     * @param basename the index basename.
     * @param numberOfTerms the number of gamma-coded values to read.
     * @return an array containing the values read, in file order.
     * @throws IOException if the global-counts file cannot be opened or read.
     */
    private static long[] readGlobalCounts( final String basename, final int numberOfTerms ) throws IOException {
        final long[] gc = new long[ numberOfTerms ];
        final InputBitStream globCounts = new InputBitStream( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
        try {
            for( int t = 0; t < numberOfTerms; t++ ) gc[ t ] = globCounts.readLongGamma();
        }
        finally {
            // Release the stream even when a read fails halfway through.
            globCounts.close();
        }
        return gc;
    }

    /** Extracts a space-separated integer field from a header line.
     *
     * @param header a header line starting with <code>"# "</code>.
     * @param index the zero-based position of the field after the prefix.
     * @return the parsed field; the field may be terminated by a space or by the end of the line.
     * @throws IllegalArgumentException if the line has no such field
     * (a {@link NumberFormatException} if the field is not an integer).
     */
    private static int headerField( final String header, final int index ) {
        int start = 2; // Skip the leading "# ".
        for( int i = 0; i < index && start > 0; i++ ) start = header.indexOf( ' ', start ) + 1;
        if ( start == 0 || start >= header.length() ) throw new IllegalArgumentException( "Malformed header line: \"" + header + "\"" );
        int end = header.indexOf( ' ', start );
        if ( end < 0 ) end = header.length();
        return Integer.parseInt( header.substring( start, end ) );
    }

    /** Parses the command line, loads the global counts and scans the stat file.
     *
     * <P>With <code>--print</code>, every header line is printed followed by its global
     * count divided by the number of occurrences of the index. Otherwise, the blocks whose
     * header falls within the requested range (quantum bit length if specified, global
     * frequency otherwise; both widened by the error percentage) are copied to standard output.
     *
     * @param arg the command-line arguments.
     * @throws IOException if the global counts or the stat file cannot be read.
     * @throws JSAPException if the command-line parser cannot be set up.
     * @throws ConfigurationException if the index property file cannot be loaded.
     */
    public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException {

        SimpleJSAP jsap = new SimpleJSAP( SelectStats.class.getName(), "Prints or selects parts of a stat file using global counts.",
            new Parameter[] {
                new Switch( "print", 'p', "print", "Just print global occurrences." ),
                new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ),
                new FlaggedOption( "quantumBitLength", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'q', "quantum-bit-length", "The quantum bit length that will be used to choose words to dump." ),
                new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),
                new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The index basename." ),
                new UnflaggedOption( "statFile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The stat file to be scanned." )
            });

        JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final boolean print = jsapResult.getBoolean( "print" );
        final String basename = jsapResult.getString( "basename" );
        final String statFile = jsapResult.getString( "statFile" );
        final int quantumBitLength = jsapResult.getInt( "quantumBitLength", 0 );
        final double globalFrequency = jsapResult.getDouble( "globalFrequency", 0 );
        final int error = jsapResult.getInt( "error", 1 );

        // Acceptance intervals: the requested value widened by the error percentage on both sides.
        final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );
        final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );
        final int lowQbl = (int)Math.round( quantumBitLength * ( 1 - error / 100.0 ) );
        final int highQbl = (int)Math.round( quantumBitLength * ( 1 + error / 100.0 ) );

        final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION );
        final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );
        final long numberOfoccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );

        final long[] gc = readGlobalCounts( basename, numberOfTerms );

        final MutableString line = new MutableString();
        final FastBufferedReader reader = new FastBufferedReader( new FileReader( statFile ) );
        try {
            // Whether the block introduced by the last header seen is being copied to standard output.
            boolean dumping = false;

            // The first line of the stat file is read and discarded.
            reader.readLine( line );

            while( reader.readLine( line ) != null ) {
                // The length check keeps empty lines from failing in charAt(); they are handled as body lines.
                if ( line.length() > 0 && line.charAt( 0 ) == '#' ) {
                    final String header = line.toString();
                    final int f = headerField( header, 0 );
                    final double freq = (double)gc[ f ] / numberOfoccurrences;

                    if ( print ) System.out.println( line + " " + format( freq ) );
                    else {
                        if ( quantumBitLength != 0 ) {
                            // We choose using the quantum bit length
                            final int q = headerField( header, 1 );
                            dumping = q >= lowQbl && q <= highQbl;
                        }
                        else dumping = freq >= lowGlobFreq && freq <= highGlobFreq;
                    }

                    if ( dumping ) line.println( System.out );
                }
                else if ( ! print && dumping ) line.println( System.out );
            }
        }
        finally {
            // The reader was never closed in the original code.
            reader.close();
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -