📄 dumpwordblocks.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.test;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * *//** Dumps various data about an inverted list. */final public class DumpWordBlocks {	//private final static Logger LOGGER = Util.getLogger( DumpWordBlocks.class );		private DumpWordBlocks() {}	/** A reasonable format for real numbers. */	private static final java.text.NumberFormat FORMAT_DOUBLE = new java.text.DecimalFormat( "#,##0.00000" );		/** Formats a number.	 *	 * <P>This method formats a double separating thousands and printing just two fractional digits.	 * @param d a number.	 * @return a string containing a pretty print of the number.	 */	public static String format( final double d ) {		final StringBuffer s = new StringBuffer();		return FORMAT_DOUBLE.format( d, s, new java.text.FieldPosition( 0 ) ).toString();	}	/*	public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException, ClassNotFoundException, InstantiationException, IllegalAccessException {		SimpleJSAP jsap = new SimpleJSAP( DumpWordBlocks.class.getName(), "Dumps data about terms in an index.",			new Parameter[] {				new Switch( "pointers", JSAP.NO_SHORTFLAG, "pointers", "Dump pointers." ),				new Switch( "gaps", JSAP.NO_SHORTFLAG, "gaps", "Dump gaps between pointers." ),				new Switch( "gapBits", JSAP.NO_SHORTFLAG, "gab-bits", "Dump lengths in bits of gaps between pointers." ),				new Switch( "counts", JSAP.NO_SHORTFLAG, "counts", "Dump counts." ),				new Switch( "relCounts", JSAP.NO_SHORTFLAG, "rel-counts", "Dump relative counts (counts divided by document size)." ),				new Switch( "posBits", JSAP.NO_SHORTFLAG, "pos-bits", "Dump lengths in bits of occurrence lists." ),				new Switch( "recordPositions", JSAP.NO_SHORTFLAG, "record-positions", "Dump bit positions (offsets from the start of the list) to document records." ),				new Switch( "separators", 's', "separators", "Adds a comment containing the term index." ),				new FlaggedOption( "word", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'w', "word", "The index of a word whose data has to be dumped." ),				new FlaggedOption( "frequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "frequency", "The relative frequency that will be used to choose words to dump." ),				new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ),				new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ),				new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." )		});		JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;		final boolean gaps = jsapResult.getBoolean( "gaps" );		final boolean pointers = jsapResult.getBoolean( "pointers" );		final boolean gapBits = jsapResult.getBoolean( "gapBits" );		final boolean counts = jsapResult.getBoolean( "counts" );		final boolean relCounts = jsapResult.getBoolean( "relCounts" );		final boolean posBits = jsapResult.getBoolean( "posBits" );		final boolean recordPositions = jsapResult.getBoolean( "recordPositions" );		final boolean separators = jsapResult.getBoolean( "separators" );		final double frequency = jsapResult.getObject( "frequency" ) != null ? jsapResult.getDouble( "frequency" ) : 0;		final double globalFrequency = jsapResult.getObject( "globalFrequency" ) != null ? jsapResult.getDouble( "globalFrequency" ) : 0;		final int error = jsapResult.getInt( "error", 1 );		final double lowFreq = frequency * ( 1 - error / 100.0 );		final double highFreq = frequency * ( 1 + error / 100.0 );		final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 );		final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 );		final String basename = jsapResult.getString( "basename" );		final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION );		final int numberOfDocuments = properties.getInt( Index.PropertyKeys.DOCUMENTS );		final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS );		final long numberOfOccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES );		final IntList size = DiskBasedIndex.readSizes( new InputBitStream( basename + DiskBasedIndex.SIZES_EXTENSION ), numberOfDocuments );				DiskBasedIndex index = DiskBasedIndex.getInstance( basename );		final IntList wordsToDump;				if ( jsapResult.getObject( "word" ) != null ) {			wordsToDump = IntLists.singleton( jsapResult.getInt( "word" ) );			LOGGER.debug( "Dumping word " + wordsToDump );		}		else {			if ( frequency == 0 && globalFrequency == 0 ) throw new IllegalArgumentException( "You must specify either a word or a frequency range" );						final int min = (int)Math.round( lowFreq * numberOfDocuments ); 			final int max = (int)Math.round( highFreq * numberOfDocuments );			final long globMin = Math.round( lowGlobFreq * numberOfOccurrences ); 			final long globMax = Math.round( highGlobFreq * numberOfOccurrences );			if ( frequency != 0 ) LOGGER.debug( "Dumping words in relative frequency range [" + format( lowFreq ) + ", " + format( highFreq ) + "] (" + numberOfDocuments + " documents, frequency range [" + min + ", " + max + "])" );			if ( globalFrequency != 0 ) LOGGER.debug( "Dumping words in relative global count range [" + format( lowGlobFreq ) + ", " + format( highGlobFreq ) + "] (" + numberOfOccurrences + " documents, global count range [" + globMin + ", " + globMax + "])" );				wordsToDump = new IntArrayList();							final InputBitStream frequencies = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.FREQUENCIES_EXTENSION ) );			final InputBitStream globCounts = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) );			int f;			long fl;			for( int t = 0; t < numberOfTerms; t++ ) {				f = frequencies.readGamma();				fl = globCounts.readLongGamma();				if ( frequency != 0 && ( f < min || f > max ) ) continue;				if ( globalFrequency != 0 && ( fl < globMin || fl > globMax ) ) continue;				wordsToDump.add( t );			}			frequencies.close();			globCounts.close();		}				LOGGER.debug( "Dumping " + wordsToDump.size() + " words..." );		int j, pointer, numOccs, prevPointer;		long start, startOccs;		IndexReader indexReader = index.getReader();				for( int i = 0; i < wordsToDump.size(); i++ ) {			final int word = wordsToDump.getInt( i );			IndexIterator indexIterator = indexReader.documents( word );			j = indexIterator.frequency();			if ( separators ) System.out.println( "# " + word + " (frequency " + j + ", relative " + (double)j / numberOfDocuments + ")" );			prevPointer = -1;						while( j-- != 0 ) {				start = indexReader.readBits();				pointer = indexIterator.nextDocument();				if ( pointers ) System.out.println( pointer );				if ( gaps ) System.out.println( pointer - prevPointer - 1 );				if ( gapBits ) System.out.println( indexReader.readBits() - start );				prevPointer = pointer;								startOccs = indexReader.readBits();				numOccs = indexIterator.count();				if ( posBits ) System.out.println( indexReader.readBits() - startOccs );				if ( recordPositions ) System.out.println( indexReader.readBits() - start );				if ( counts ) System.out.println( numOccs );				if ( relCounts ) System.out.println( (double)numOccs / size.getInt( pointer ) );			}			if ( gaps ) System.out.println( numberOfDocuments - prevPointer );		}					LOGGER.debug( " done." );	}*/}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -