📄 verifier.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.test;import it.unimi.dsi.fastutil.ints.Int2IntMap;import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;import it.unimi.dsi.fastutil.ints.IntAVLTreeSet;import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.ints.IntIterators;import it.unimi.dsi.fastutil.ints.IntLinkedOpenHashSet;import it.unimi.dsi.fastutil.ints.IntOpenHashSet;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.IndexReader;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.mg4j.search.AndDocumentIterator;import it.unimi.dsi.mg4j.search.OrDocumentIterator;import it.unimi.dsi.Util;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import java.io.File;import java.io.Reader;import java.util.Arrays;import org.apache.commons.lang.ArrayUtils;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** Verifies that an index matches a collection. */final public class Verifier {	private static final Logger LOGGER = Util.getLogger( Verifier.class );		private Verifier() {}	@SuppressWarnings("unchecked")	public static void main( final String[] arg ) throws Throwable {		SimpleJSAP jsap = new SimpleJSAP( Verifier.class.getName(), "Scans an index and associated files, checking internal coherence. Optionally, compares the index with a document sequence.",				new Parameter[] {					new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ),					new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),					new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),					new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'I', "indexed-field", "The field(s) of the document factory that will be indexed. (default: all fields)" ).setAllowMultipleDeclarations( true ),					new Switch( "allFields", 'a', "all-fields", "Index also all virtual fields; has no effect if indexedField has been used at least once." ),					new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( it.unimi.dsi.mg4j.tool.Scan.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),					new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( it.unimi.dsi.mg4j.tool.Scan.DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter." ),					new FlaggedOption( "renumber", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', "renumber", "The filename of a document renumbering." ),					new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),					new Switch( "termLists", 't', "term-lists", "Instead of assuming each index knows its terms, read a term file stemmed from the index name." ),					new Switch( "stem", 's', "stem", "Stem basename using field names from the collection." ),					new Switch( "random", 'R', "random", "Perform random access checks; requires a collection (will use stdin if none is specified)." ),					new Switch( "virtual", JSAP.NO_SHORTFLAG, "virtual", "Virtual collection; skip document size/occurrences check and random access check." ),					new Switch( "noSeq", JSAP.NO_SHORTFLAG, "no-seq", "Skip sequential check." ),					new Switch( "noSkip", JSAP.NO_SHORTFLAG, "no-skip", "Skip \"all-skips\" check." ),					new Switch( "noComp", JSAP.NO_SHORTFLAG, "no-comp", "Skip composite iterator check." ),					new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the index." )			});				JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;				DocumentSequence documentSequence = it.unimi.dsi.mg4j.tool.Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );				final DocumentFactory factory = documentSequence.factory();		final boolean stem = jsapResult.getBoolean( "stem" );		final boolean termLists = jsapResult.getBoolean( "termLists" );		final int[] indexedField = it.unimi.dsi.mg4j.tool.Scan.parseFieldNames( jsapResult.getStringArray( "indexedField" ), factory, jsapResult.getBoolean( "allFields" )  );				LOGGER.debug( "Parsed indexed field: " + IntArrayList.wrap( indexedField ) );				final String basename = jsapResult.getString( "basename" ); 		final String permutationFile = jsapResult.getString( "renumber" );		// A map from terms to 0-based term indices.		int i, t = 0;		final ProgressLogger pl = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "ints" );		final Index[] index = stem ? new Index[ indexedField.length ] : new Index[ 1 ];		final int numberOfTerms[] = new int[ indexedField.length ];		final ObjectArrayList<MutableString>[] terms = new ObjectArrayList[ indexedField.length ];		final IndexReader[] indexReader = new IndexReader[ index.length ];		final InputBitStream[] frequencies = new InputBitStream[ index.length ];		final int[][] count = new int[ index.length ][];		final int[] permutation = permutationFile != null ? BinIO.loadInts( permutationFile ) : null;		final int[][] occ = new int[ index.length ][];		final int[][] wordInPos = new int[ index.length ][];		final Int2IntMap[] termsInDoc = new Int2IntOpenHashMap[ index.length ];		int totalTerms = 0;				boolean allBitStreamIndices = true;				for( i = 0; i < index.length; i++ ) {			final String basenameField = basename + (stem ? "-" + factory.fieldName( indexedField[ i ] ) : "" );			index[ i ] = Index.getInstance( basenameField );			if ( ! ( index[ i ] instanceof BitStreamIndex ) ) allBitStreamIndices = false;						if ( termLists ) {				terms[ i ] = new ObjectArrayList<MutableString>( new FileLinesCollection( basenameField + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).allLines() );				numberOfTerms[ i ] = terms[ i ].size();			}			else numberOfTerms[ i ] = index[ i ].numberOfTerms;			totalTerms += numberOfTerms[ i ];						// This will be matched with the number of occurrences per document			count[ i ] = new int[ index[ i ].numberOfDocuments ];			occ[ i ] = index[ i ].maxCount > 0 ? new int[ index[ i ].maxCount ] : IntArrays.EMPTY_ARRAY;			wordInPos[ i ] = new int[ Math.max( 0, index[ i ].properties.getInt( Index.PropertyKeys.MAXDOCSIZE ) ) ];			indexReader[ i ] = index[ i ].getReader();						if ( new File( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION ).exists() ) frequencies[ i ] = new InputBitStream( basenameField + DiskBasedIndex.FREQUENCIES_EXTENSION );			termsInDoc[ i ] = new Int2IntOpenHashMap();		}		int currDoc = 0,		// Term position in the current document.		pos = 0, f = 0, p;		pl.itemsName = "lists";		pl.expectedUpdates = totalTerms;				int indexFrequency = -1;				// Sequential scan		if ( !jsapResult.getBoolean( "noSeq" ) ) {			try {				for ( i = 0; i < index.length; i++ ) {					int numberOfPostings = 0;					pl.expectedUpdates = numberOfTerms[ i ];					pl.start( "Verifying sequentially index " + index[ i ] + "..." );					if ( allBitStreamIndices ) {						for ( t = 0; t < numberOfTerms[ i ]; t++ ) {							pl.update();							IndexIterator indexIterator = indexReader[ i ].nextIterator();							indexFrequency = indexIterator.frequency();							numberOfPostings += indexFrequency;							if ( frequencies[ i ] != null && indexFrequency != ( f = frequencies[ i ].readGamma() ) ) {								System.err.println( "Error in frequency for term " + t + ": expected " + f + " documents, found " + indexFrequency );								return;							}							while ( indexFrequency-- != 0 ) {								p = indexIterator.nextDocument();								if (index[i].hasCounts) count[i][p] += indexIterator.count();								if (index[i].hasPositions) indexIterator.positionArray(); // Just to force reading in high-performance indices							}							if ( indexIterator.nextDocument() != -1 ) throw new AssertionError( "nextDocument() is not -1 after exhaustive iteration" );						}												// Check document sizes						if ( ! jsapResult.getBoolean( "virtual" ) && ( (BitStreamIndex) index[ i ] ).sizes != null && index[ i ].hasCounts )							for ( p = 0; p < index[ i ].numberOfDocuments; p++ )								if ( index[ i ].sizes.getInt( p ) != count[ i ][ p ] )									System.err.println( "Document " + p + " has size " + ( (BitStreamIndex) index[ i ] ).sizes.getInt( p ) + " but " + count[ i ][ p ] + " occurrences have been stored." );											}					else { // Non-bitstream indices						for (t = 0; t < numberOfTerms[ i ]; t++) {							pl.update();							IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );							indexFrequency = indexIterator.frequency();							numberOfPostings += indexFrequency;							if (frequencies[i] != null && indexFrequency != (f = frequencies[i].readGamma())) {								System.err.println("Error in frequency for term " + t										+ ": expected " + f + " documents, found "										+ indexFrequency);								return;							}														int prevp = -1;							while (indexFrequency-- != 0) {								p = indexIterator.nextDocument();								assert prevp < p : "previous pointer: " + prevp + "; current pointer: " + p;								prevp = p;								if (index[i].hasCounts) count[i][p] += indexIterator.count();							}						}					}					pl.done();										if ( numberOfPostings != index[ i ].numberOfPostings ) System.err.println( "Index declares " + index[ i ].numberOfPostings + " postings, but we found " + numberOfPostings );					long numberOfOccurrences = 0;					if ( index[ i ].hasCounts ) {						for ( p = 0; p < index[ i ].numberOfDocuments; p++ ) numberOfOccurrences += count[ i ][ p ];						if ( numberOfOccurrences != index[ i ].numberOfOccurrences ) System.err.println( "Index declares " + index[ i ].numberOfOccurrences + " occurrences, but we found " + numberOfOccurrences );					}				}			} catch ( Exception e ) {				System.err.println( "Exception while scanning sequentially term " + t + " of index " + index[ i ] );				System.err.println( "Term frequency was " + f + " and position " + ( f - indexFrequency - 1 ) );				throw e;			}		}			IntArrayList l = new IntArrayList();		ObjectArrayList<int[]> positions = new ObjectArrayList<int[]>();				if ( ! jsapResult.getBoolean( "noSkip" ) ) {			int start = 0, end = 0, result;			try {				for (i = 0; i < index.length; i++) {										pl.expectedUpdates = numberOfTerms[ i ];					pl.start("Verifying all skips in " + index[i] + "...");					for (t = 0; t < numberOfTerms[ i ]; t++) {						l.clear();						positions.clear();						IndexIterator documents = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );						int d;						while( ( d = documents.nextDocument() ) != -1 ) {							l.add( d );							if ( index[ i ].hasPositions ) positions.add( ArrayUtils.subarray( documents.positionArray(), 0, documents.count() ) );						}												for( start = 0; start < l.size(); start++ ) {							for( end = start + 1; end < l.size(); end++ ) {								IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );																result = indexIterator.skipTo( l.getInt( start ) );								assert indexIterator.document() == l.getInt( start ) && result == l.getInt( start ): "Trying to skip to document " + l.getInt( start ) + " (term " + t + ") moved to " + indexIterator.document() + "(skipTo() returned " + result + ")";								result = indexIterator.skipTo( l.getInt( end ) );								assert indexIterator.document() == l.getInt( end ) && result == l.getInt( end ): "Trying to skip to document " + l.getInt( end ) + " (term " + t + ") after a skip to " + start + " moved to " + indexIterator.document() + "(skipTo() returned " + result + ")";																if ( index[ i ].hasPositions ) {									// This catches wrong state reconstruction after skips.									indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );									indexIterator.skipTo( l.getInt( start ) );									assert indexIterator.document() == l.getInt( start ) : indexIterator.document() + " != " + l.getInt( start );									assert indexIterator.count() == positions.get( start ).length: indexIterator.count() + " != " + positions.get( start ).length;									assert Arrays.equals( positions.get( start ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )										: Arrays.toString( positions.get( start ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) );									indexIterator.skipTo( l.getInt( end ) );									assert indexIterator.document() == l.getInt( end ) : indexIterator.document() + " != " + l.getInt( end );									assert indexIterator.count() == positions.get( end ).length: indexIterator.count() + " != " + positions.get( end ).length;									assert Arrays.equals( positions.get( end ), ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) )										: Arrays.toString( positions.get( end ) ) + "!=" + Arrays.toString( ArrayUtils.subarray( indexIterator.positionArray(), 0, indexIterator.count() ) );								}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -