📄 dumpwordblocks.java
字号:
package it.unimi.dsi.mg4j.test;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * *//** Dumps various data about an inverted list. */final public class DumpWordBlocks { //private final static Logger LOGGER = Util.getLogger( DumpWordBlocks.class ); private DumpWordBlocks() {} /** A reasonable format for real numbers. */ private static final java.text.NumberFormat FORMAT_DOUBLE = new java.text.DecimalFormat( "#,##0.00000" ); /** Formats a number. * * <P>This method formats a double separating thousands and printing just two fractional digits. * @param d a number. * @return a string containing a pretty print of the number. */ public static String format( final double d ) { final StringBuffer s = new StringBuffer(); return FORMAT_DOUBLE.format( d, s, new java.text.FieldPosition( 0 ) ).toString(); } /* public static void main( final String[] arg ) throws IOException, JSAPException, ConfigurationException, ClassNotFoundException, InstantiationException, IllegalAccessException { SimpleJSAP jsap = new SimpleJSAP( DumpWordBlocks.class.getName(), "Dumps data about terms in an index.", new Parameter[] { new Switch( "pointers", JSAP.NO_SHORTFLAG, "pointers", "Dump pointers." ), new Switch( "gaps", JSAP.NO_SHORTFLAG, "gaps", "Dump gaps between pointers." ), new Switch( "gapBits", JSAP.NO_SHORTFLAG, "gab-bits", "Dump lengths in bits of gaps between pointers." ), new Switch( "counts", JSAP.NO_SHORTFLAG, "counts", "Dump counts." ), new Switch( "relCounts", JSAP.NO_SHORTFLAG, "rel-counts", "Dump relative counts (counts divided by document size)." ), new Switch( "posBits", JSAP.NO_SHORTFLAG, "pos-bits", "Dump lengths in bits of occurrence lists." ), new Switch( "recordPositions", JSAP.NO_SHORTFLAG, "record-positions", "Dump bit positions (offsets from the start of the list) to document records." ), new Switch( "separators", 's', "separators", "Adds a comment containing the term index." ), new FlaggedOption( "word", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'w', "word", "The index of a word whose data has to be dumped." ), new FlaggedOption( "frequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "frequency", "The relative frequency that will be used to choose words to dump." ), new FlaggedOption( "globalFrequency", JSAP.DOUBLE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "global-frequency", "The global count divided by the sum of document lengths that will be used to choose words to dump." ), new FlaggedOption( "error", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'e', "error", "The error w.r.t. frequency (as a percentage) that will be used to choose words to dump." ), new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; final boolean gaps = jsapResult.getBoolean( "gaps" ); final boolean pointers = jsapResult.getBoolean( "pointers" ); final boolean gapBits = jsapResult.getBoolean( "gapBits" ); final boolean counts = jsapResult.getBoolean( "counts" ); final boolean relCounts = jsapResult.getBoolean( "relCounts" ); final boolean posBits = jsapResult.getBoolean( "posBits" ); final boolean recordPositions = jsapResult.getBoolean( "recordPositions" ); final boolean separators = jsapResult.getBoolean( "separators" ); final double frequency = jsapResult.getObject( "frequency" ) != null ? jsapResult.getDouble( "frequency" ) : 0; final double globalFrequency = jsapResult.getObject( "globalFrequency" ) != null ? jsapResult.getDouble( "globalFrequency" ) : 0; final int error = jsapResult.getInt( "error", 1 ); final double lowFreq = frequency * ( 1 - error / 100.0 ); final double highFreq = frequency * ( 1 + error / 100.0 ); final double lowGlobFreq = globalFrequency * ( 1 - error / 100.0 ); final double highGlobFreq = globalFrequency * ( 1 + error / 100.0 ); final String basename = jsapResult.getString( "basename" ); final Properties properties = new Properties( basename + DiskBasedIndex.PROPERTIES_EXTENSION ); final int numberOfDocuments = properties.getInt( Index.PropertyKeys.DOCUMENTS ); final int numberOfTerms = properties.getInt( Index.PropertyKeys.TERMS ); final long numberOfOccurrences = properties.getLong( Index.PropertyKeys.OCCURRENCES ); final IntList size = DiskBasedIndex.readSizes( new InputBitStream( basename + DiskBasedIndex.SIZES_EXTENSION ), numberOfDocuments ); DiskBasedIndex index = DiskBasedIndex.getInstance( basename ); final IntList wordsToDump; if ( jsapResult.getObject( "word" ) != null ) { wordsToDump = IntLists.singleton( jsapResult.getInt( "word" ) ); LOGGER.debug( "Dumping word " + wordsToDump ); } else { if ( frequency == 0 && globalFrequency == 0 ) throw new IllegalArgumentException( "You must specify either a word or a frequency range" ); final int min = (int)Math.round( lowFreq * numberOfDocuments ); final int max = (int)Math.round( highFreq * numberOfDocuments ); final long globMin = Math.round( lowGlobFreq * numberOfOccurrences ); final long globMax = Math.round( highGlobFreq * numberOfOccurrences ); if ( frequency != 0 ) LOGGER.debug( "Dumping words in relative frequency range [" + format( lowFreq ) + ", " + format( highFreq ) + "] (" + numberOfDocuments + " documents, frequency range [" + min + ", " + max + "])" ); if ( globalFrequency != 0 ) LOGGER.debug( "Dumping words in relative global count range [" + format( lowGlobFreq ) + ", " + format( highGlobFreq ) + "] (" + numberOfOccurrences + " documents, global count range [" + globMin + ", " + globMax + "])" ); wordsToDump = new IntArrayList(); final InputBitStream frequencies = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.FREQUENCIES_EXTENSION ) ); final InputBitStream globCounts = new InputBitStream( new FileInputStream ( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ) ); int f; long fl; for( int t = 0; t < numberOfTerms; t++ ) { f = frequencies.readGamma(); fl = globCounts.readLongGamma(); if ( frequency != 0 && ( f < min || f > max ) ) continue; if ( globalFrequency != 0 && ( fl < globMin || fl > globMax ) ) continue; wordsToDump.add( t ); } frequencies.close(); globCounts.close(); } LOGGER.debug( "Dumping " + wordsToDump.size() + " words..." ); int j, pointer, numOccs, prevPointer; long start, startOccs; IndexReader indexReader = index.getReader(); for( int i = 0; i < wordsToDump.size(); i++ ) { final int word = wordsToDump.getInt( i ); IndexIterator indexIterator = indexReader.documents( word ); j = indexIterator.frequency(); if ( separators ) System.out.println( "# " + word + " (frequency " + j + ", relative " + (double)j / numberOfDocuments + ")" ); prevPointer = -1; while( j-- != 0 ) { start = indexReader.readBits(); pointer = indexIterator.nextDocument(); if ( pointers ) System.out.println( pointer ); if ( gaps ) System.out.println( pointer - prevPointer - 1 ); if ( gapBits ) System.out.println( indexReader.readBits() - start ); prevPointer = pointer; startOccs = indexReader.readBits(); numOccs = indexIterator.count(); if ( posBits ) System.out.println( indexReader.readBits() - startOccs ); if ( recordPositions ) System.out.println( indexReader.readBits() - start ); if ( counts ) System.out.println( numOccs ); if ( relCounts ) System.out.println( (double)numOccs / size.getInt( pointer ) ); } if ( gaps ) System.out.println( numberOfDocuments - prevPointer ); } LOGGER.debug( " done." ); }*/}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -