📄 indextest.java
字号:
/** * Checks that the fields indexed by the given indices have been indexed correctly by performing * a mock index construction over the given sequence. * * @param sequence a document sequence. * @param resolver the virtual document resolver used to index the collection (we assume the * same for all virtual fields), or <code>null</code>. * @param gap the virtual document gap (we assume the same for all virtual fields; it is * immaterial if no field is virtual). * @param index a list of indices that have indexed one or more fields of <code>sequence</code>. */ @SuppressWarnings("unchecked") public void checkAgainstContent( DocumentSequence sequence, int[] map, VirtualDocumentResolver resolver, int gap, Index... index ) throws IOException { DocumentIterator iterator = sequence.iterator(); DocumentFactory factory = sequence.factory(); Document document; final int n = index.length; final int[] field = new int[ n ]; final int[][] currMaxPos = new int[ n ][]; final int[] maxDoc = new int[ n ]; IntArrays.fill( maxDoc, -1 ); final Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>[] termMap = new Object2ObjectOpenHashMap[ n ]; final IntArrayList[] payloadPointers = new IntArrayList[ n ]; final ObjectArrayList<Object>[] payloadContent = new ObjectArrayList[ n ]; for ( int i = 0; i < n; i++ ) { field[ i ] = factory.fieldIndex( index[ i ].field ); switch ( factory.fieldType( field[ i ] ) ) { case VIRTUAL: currMaxPos[ i ] = new int[ resolver.numberOfDocuments() ]; case TEXT: termMap[ i ] = new Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>(); break; case DATE: case INT: payloadPointers[ i ] = new IntArrayList(); payloadContent[ i ] = new ObjectArrayList<Object>(); } } int documentIndex = 0; while ( ( document = iterator.nextDocument() ) != null ) { for ( int i = 0; i < field.length; i++ ) { switch ( factory.fieldType( field[ i ] ) ) { case TEXT: processDocument( document.wordReader( field[ i ] ).setReader( (Reader)document.content( field[ i ] ) ), map == null ? documentIndex : map[ documentIndex ], 0, termMap[ i ], index[ i ].termProcessor ); break; case VIRTUAL: ObjectArrayList<VirtualDocumentFragment> fragments = (ObjectArrayList<VirtualDocumentFragment>)document.content( field[ i ] ); resolver.context( document ); for ( VirtualDocumentFragment fragment : fragments ) { int d = resolver.resolve( fragment.documentSpecifier() ); if ( d != -1 ) { if ( map != null ) d = map[ d ]; if ( maxDoc[ i ] < d ) maxDoc[ i ] = d; currMaxPos[ i ][ d ] = processDocument( document.wordReader( field[ i ] ).setReader( new FastBufferedReader( fragment.text() ) ), d, currMaxPos[ i ][ d ], termMap[ i ], index[ i ].termProcessor ) + gap; } } break; case INT: case DATE: Object x = document.content( field[ i ] ); if ( x != null ) { payloadPointers[ i ].add( map == null ? documentIndex : map[ documentIndex ] ); payloadContent[ i ].add( x ); } default: } } document.close(); documentIndex++; } iterator.close(); for ( int i = 0; i < n; i++ ) { if ( termMap[ i ] != null ) for ( ObjectArrayList<int[]> list : termMap[ i ].values() ) { // We sort in all cases, just to reduce the possible execution paths Collections.sort( list, new Comparator<int[]>() { public int compare( int[] p0, int[] p1 ) { return p0[ 0 ] - p1[ 0 ]; } } ); switch ( factory.fieldType( field[ i ] ) ) { case VIRTUAL: // We coalesce the list ObjectArrayList<int[]> newList = new ObjectArrayList<int[]>(); for ( int k = 0; k < list.size(); ) { int s; for ( s = k + 1; s < list.size(); s++ ) if ( list.get( k )[ 0 ] != list.get( s )[ 0 ] ) break; int count = 0; for ( int t = k; t < s; t++ ) count += list.get( t ).length - 1; int[] posting = new int[ count + 1 ]; posting[ 0 ] = list.get( k )[ 0 ]; count = 1; for ( int t = k; t < s; t++ ) { System.arraycopy( list.get( t ), 1, posting, count, list.get( t ).length - 1 ); count += list.get( t ).length - 1; } k = s; newList.add( posting ); } list.clear(); list.addAll( newList ); break; default: } } if ( payloadPointers[ i ] != null ) { final int p[] = payloadPointers[ i ].elements(); final Object[] b = payloadContent[ i ].elements(); GenericSorting.quickSort( 0, payloadPointers[ i ].size(), new IntComparator() { public int compare( int i0, int i1 ) { return p[ i0 ] - p[ i1 ]; } }, new Swapper() { public void swap( int i0, int i1 ) { final int t = p[ i0 ]; p[ i0 ] = p[ i1 ]; p[ i1 ] = t; final Object o = b[ i0 ]; b[ i0 ] = b[ i1 ]; b[ i1 ] = o; } } ); } } for ( int i = 0; i < n; i++ ) { assertEquals( index[ i ].toString(), factory.fieldType( field[ i ] ) == FieldType.VIRTUAL ? maxDoc[ i ] + 1 : documentIndex, index[ i ].numberOfDocuments ); switch ( factory.fieldType( field[ i ] ) ) { case TEXT: case VIRTUAL: assertEquals( termMap[ i ].size(), index[ i ].numberOfTerms ); int postings = 0, occurrences = 0; for ( ObjectArrayList<int[]> l : termMap[ i ].values() ) { postings += l.size(); for ( int[] p : l ) occurrences += p.length - 1; } assertEquals( index[ i ].toString(), postings, index[ i ].numberOfPostings ); assertEquals( occurrences, index[ i ].numberOfOccurrences ); IndexReader indexReader = index[ i ].getReader(); for ( MutableString term : new ObjectRBTreeSet<MutableString>( termMap[ i ].keySet() ).toArray( new MutableString[ termMap[ i ].size() ] ) ) { String msg = index[ i ] + ":" + term; IndexIterator indexIterator = indexReader.documents( term ); ObjectArrayList<int[]> list = termMap[ i ].get( term ); int k = 0; while ( indexIterator.hasNext() ) { assertEquals( msg, list.get( k )[ 0 ], indexIterator.nextDocument() ); // Document // pointer assertEquals( msg, list.get( k ).length - 1, indexIterator.count() ); // Count final int[] position = indexIterator.positionArray(); for ( int p = 0; p < indexIterator.count(); p++ ) assertEquals( msg, list.get( k )[ p + 1 ], position[ p ] ); // Positions k++; } assertEquals( k, list.size() ); // This implicitly checks the frequency } indexReader.close(); break; case INT: case DATE: assertEquals( index[ i ].toString(), payloadPointers[ i ].size(), index[ i ].numberOfPostings ); assertEquals( index[ i ].toString(), documentIndex != 0 ? 1 : 0, index[ i ].numberOfTerms ); assertEquals( index[ i ].toString(), -1, index[ i ].numberOfOccurrences ); if ( documentIndex != 0 ) { IndexIterator indexIterator = index[ i ].documents( 0 ); int k = 0; while ( indexIterator.hasNext() ) { assertEquals( payloadPointers[ i ].getInt( k ), indexIterator.nextDocument() ); if ( factory.fieldType( field[ i ] ) == FieldType.INT ) assertEquals( ( (Number)payloadContent[ i ].get( k ) ).longValue(), ( (Number)indexIterator.payload().get() ) .longValue() ); else assertEquals( payloadContent[ i ].get( k ), indexIterator.payload().get() ); k++; } indexIterator.dispose(); assertEquals( k, payloadContent[ i ].size() ); } } } } public void setUp() throws IOException { basename = File.createTempFile( this.getClass().getSimpleName(), "test" ).getCanonicalPath(); } public void tearDown() throws IOException { for ( Object f : FileUtils.listFiles( new File( basename ).getParentFile(), FileFilterUtils.prefixFileFilter( this.getClass().getSimpleName() ), null ) ) ( (File)f ).delete(); if ( lastSequence != null ) lastSequence.close(); } // We keep track of the last returned sequence to close it without cluttering the test code private DocumentSequence lastSequence; public DocumentSequence getSequence() throws ConfigurationException, IOException { if ( lastSequence != null ) lastSequence.close(); return lastSequence = new CompositeDocumentSequence( new InputStreamDocumentSequence( this.getClass().getResourceAsStream( "documents.data" ), 10, new IdentityDocumentFactory( new String[] { "encoding=UTF-8" } ), NUMBER_OF_DOCUMENTS ), new IntArrayDocumentCollection( INTEGER_DOCUMENT ), new DateArrayDocumentCollection( DATE_DOCUMENT ), new MapVirtualDocumentCollection( VIRTUAL_DOCUMENT ) ); } @SuppressWarnings("unchecked") public DocumentSequence getEmptySequence() throws ConfigurationException, IOException { if ( lastSequence != null ) lastSequence.close(); return lastSequence = new CompositeDocumentSequence( new StringArrayDocumentCollection(), new IntArrayDocumentCollection(), new DateArrayDocumentCollection(), new MapVirtualDocumentCollection() ); } public void testIndex( int quantum, int height, TermProcessor termProcessor ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { final boolean interleaved = quantum >= 0; if ( !interleaved ) quantum = -quantum; // Vanilla indexing new IndexBuilder( basename, getSequence() ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ) .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run(); checkAgainstContent( getSequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text" ), Index.getInstance( basename + "-int" ), Index .getInstance( basename + "-date" ), Index.getInstance( basename + "-virtual" ) ); final String basenameZipped = basename + "-zipped"; // Vanilla indexing with zipped collection new IndexBuilder( basename, getSequence() ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ) .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).zipCollectionBasename( basenameZipped ).run(); // Vanilla indexing with zipped collection new IndexBuilder( basenameZipped, (DocumentSequence)BinIO.loadObject( basenameZipped + ".collection" ) ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 ) .pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run(); // The two indices must be byte-by-byte identical (and we keep the zipped index for future // reference) sameIndex( basename + "-text", basenameZipped + "-text" ); sameIndex( basename + "-int", basenameZipped + "-int", "batches" ); sameIndex( basename + "-date", basenameZipped + "-date", "batches" ); sameIndex( basename + "-virtual", basenameZipped + "-virtual", "batches" ); // Indexing with just one batch new IndexBuilder( basename + "-onebatch", getSequence() ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ) .quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( NUMBER_OF_DOCUMENTS ).run(); // The two indices must be byte-by-byte identical sameIndex( basename + "-text", basename + "-onebatch-text", "batches" ); sameIndex( basename + "-int", basename + "-onebatch-int", "batches" ); sameIndex( basename + "-date", basename + "-onebatch-date", "batches" ); sameIndex( basename + "-virtual", basename + "-onebatch-virtual", "batches" ); } public void testIndex( int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { testIndex( quantum, height, DowncaseTermProcessor.getInstance() ); } public void testIndex() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { testIndex( 0, 0 ); testIndex( 0, 0, KILL_A_PROCESSOR ); testIndex( 1, 1 ); testIndex( 1, 2 ); testIndex( 4, 1 ); testIndex( 4, 4 );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -