scan.java
        final OutputBitStream frequencies = new OutputBitStream( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
        final OutputBitStream globCounts = new OutputBitStream( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );

        if ( indexingIsStandard ) {
            final OutputBitStream index = new OutputBitStream( batchBasename + DiskBasedIndex.INDEX_EXTENSION );
            final OutputBitStream offsets = new OutputBitStream( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION );

            ByteArrayPostingList baobs;
            int maxCount = 0, frequency;
            long bitLength, postings = 0, prevOffset = 0;
            offsets.writeGamma( 0 );

            for ( int i = 0; i < numTerms; i++ ) {
                baobs = termMap.get( termArray[ i ] );
                frequency = baobs.frequency;
                baobs.flush();
                if ( maxCount < baobs.maxCount ) maxCount = baobs.maxCount;
                bitLength = baobs.writtenBits();
                baobs.align();
                postings += frequency;

                index.writeGamma( frequency - 1 );
                // We need special treatment for terms appearing in all documents
                if ( frequency == documentCount ) baobs.stripPointers( index, bitLength );
                else index.write( baobs.buffer, bitLength );

                frequencies.writeGamma( frequency );
                globCounts.writeLongGamma( baobs.globCount );
                offsets.writeLongGamma( index.writtenBits() - prevOffset );
                prevOffset = index.writtenBits();
            }

            totPostings += postings;

            final Properties properties = new Properties();
            properties.setProperty( Index.PropertyKeys.DOCUMENTS, documentCount );
            properties.setProperty( Index.PropertyKeys.TERMS, numTerms );
            properties.setProperty( Index.PropertyKeys.POSTINGS, postings );
            properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
            properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
            properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
            properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
            properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
            properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
            properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
            properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
            properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
            properties.setProperty( Index.PropertyKeys.SIZE, index.writtenBits() );
            if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
            properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

            index.close();
            offsets.close();
        }
        else {
            final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );
            ByteArrayPostingList baobs;
            OutputBitStream obs;
            int maxCount = 0, maxFrequency = 0, frequency, count;

            // Compute max frequency and allocate position array.
            for ( ByteArrayPostingList b : termMap.values() ) {
                b.flush();
                b.align();
                if ( maxFrequency < b.frequency ) maxFrequency = b.frequency;
                if ( maxCount < b.maxCount ) maxCount = b.maxCount;
            }

            final long[] bitPos = new long[ maxFrequency ];
            final int[] pointer = new int[ maxFrequency ];
            int[] pos = new int[ maxCount ];

            for ( int i = 0; i < numTerms; i++ ) {
                baobs = termMap.get( termArray[ i ] );
                final InputBitStream ibs = new InputBitStream( baobs.buffer );
                frequency = baobs.frequency; // This could be much more than the actual frequency in virtual indices

                // Calculate posting bit positions and corresponding pointers
                for ( int j = 0; j < frequency; j++ ) {
                    bitPos[ j ] = ibs.readBits(); // Cache bit position
                    pointer[ j ] = ibs.readDelta(); // Cache pointer
                    for ( int p = ibs.readGamma() + 1; p-- != 0; ) ibs.readDelta(); // Skip document positions
                }

                // Stably sort pointers and positions by increasing pointer
                GenericSorting.quickSort( 0, frequency,
                    new IntComparator() {
                        public int compare( final int i0, final int i1 ) {
                            final int t = pointer[ i0 ] - pointer[ i1 ];
                            if ( t != 0 ) return t;
                            final long u = bitPos[ i0 ] - bitPos[ i1 ]; // We need a stable sort
                            return u < 0 ? -1 : u > 0 ? 1 : 0;
                        }
                    },
                    new Swapper() {
                        public void swap( final int i0, final int i1 ) {
                            final long t = bitPos[ i0 ]; bitPos[ i0 ] = bitPos[ i1 ]; bitPos[ i1 ] = t;
                            final int p = pointer[ i0 ]; pointer[ i0 ] = pointer[ i1 ]; pointer[ i1 ] = p;
                        }
                    } );

                int actualFrequency = frequency;
                // Compute actual frequency for virtual indices
                if ( indexingIsVirtual ) {
                    actualFrequency = 1;
                    for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
                    if ( ASSERTS ) {
                        for ( int j = 1; j < frequency; j++ ) {
                            assert pointer[ j ] >= pointer[ j - 1 ];
                            assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
                        }
                    }
                }

                indexWriter.newInvertedList();
                indexWriter.writeFrequency( actualFrequency );

                int currPointer;
                for ( int j = 0; j < frequency; j++ ) {
                    ibs.position( bitPos[ j ] );
                    obs = indexWriter.newDocumentRecord();
                    indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
                    if ( ASSERTS ) assert currPointer == pointer[ j ];
                    count = ibs.readGamma() + 1;
                    pos[ 0 ] = ibs.readDelta();
                    for ( int p = 1; p < count; p++ ) pos[ p ] = pos[ p - 1 ] + 1 + ibs.readDelta();

                    if ( indexingIsVirtual ) {
                        while ( j < frequency - 1 ) {
                            ibs.position( bitPos[ j + 1 ] );
                            if ( currPointer != ibs.readDelta() ) break;
                            j++;
                            final int moreCount = ibs.readGamma() + 1;
                            pos = IntArrays.grow( pos, count + moreCount, count );
                            pos[ count ] = ibs.readDelta();
                            if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
                            for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
                            count += moreCount;
                        }
                        if ( maxCount < count ) maxCount = count;
                    }

                    indexWriter.writePositionCount( obs, count );
                    indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
                }

                frequencies.writeGamma( actualFrequency );
                globCounts.writeLongGamma( baobs.globCount );
            }

            indexWriter.close();

            final Properties properties = indexWriter.properties();
            totPostings += properties.getLong( "postings" );
            properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
            properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
            properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
            properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
            if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
            properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

            if ( indexingIsRemapped ) {
                // We must permute sizes
                final int[] document = new int[ documentCount ], size = new int[ documentCount ];
                final InputBitStream sizes = new InputBitStream( batchBasename + DiskBasedIndex.SIZES_EXTENSION );
                for ( int i = 0; i < documentCount; i++ ) {
                    document[ i ] = sizes.readGamma();
                    size[ i ] = sizes.readGamma();
                }
                GenericSorting.quickSort( 0, documentCount,
                    new IntComparator() {
                        public int compare( int x, int y ) { return document[ x ] - document[ y ]; }
                    },
                    new Swapper() {
                        public void swap( int x, int y ) {
                            int t = document[ x ]; document[ x ] = document[ y ]; document[ y ] = t;
                            t = size[ x ]; size[ x ] = size[ y ]; size[ y ] = t;
                        }
                    } );
                final OutputBitStream permutedSizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
                for ( int i = 0, d = 0; i < documentCount; i++ ) {
                    while ( d++ < document[ i ] ) permutedSizes.writeGamma( 0 );
                    permutedSizes.writeGamma( size[ i ] );
                }
                permutedSizes.close();
            }
        }

        if ( indexingIsVirtual ) {
            final OutputBitStream sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
            for ( int i = 0; i < currMaxPos.length; i++ ) sizes.writeGamma( currMaxPos[ i ] );
            sizes.close();
            IntArrays.fill( currMaxPos, 0 );
        }

        globCounts.close();
        frequencies.close();
        termMap.clear();

        numTerms = 0;
        totOccurrences += numOccurrences;
        totDocuments += documentCount;
        final long result = numOccurrences;
        numOccurrences = 0;
        globMaxDocSize = Math.max( maxDocSize, globMaxDocSize );
        // The cut point must be recorded before the document count is reset.
        if ( indexingIsStandard ) cutPoints.add( cutPoints.getInt( cutPoints.size() - 1 ) + documentCount );
        maxDocSize = documentCount = 0;
        maxDocInBatch = -1;
        batch++;

        System.gc(); // This is exactly the right time to do collection and compaction.
        return result;
    }
    catch ( IOException e ) {
        LOGGER.fatal( "I/O Error on batch " + batch );
        throw e;
    }
}

protected void openSizeBitStream() throws FileNotFoundException {
    if ( ! indexingIsVirtual ) sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
}

/** Runs a number of instances in parallel. */
public static void run( final String basename, final DocumentSequence documentSequence, final TermProcessor termProcessor,
        final String zipCollectionBasename, final int bufferSize, final int documentsPerBatch, final int[] indexedField,
        final String renumberingFile, final long logInterval, final String tempDirName ) throws ConfigurationException, IOException {
    run( basename, documentSequence, termProcessor, zipCollectionBasename, bufferSize, documentsPerBatch, indexedField,
            null, null, renumberingFile, logInterval, tempDirName );
}

/**
 * Runs a number of instances in parallel.
 *
 * <p>This convenience method takes care of instantiating one instance per indexed field and of passing
 * the right information to each instance. All options are common to all fields, except for the number of
 * occurrences in a batch, which can be tuned for each field separately.
 *
 * @param basename the index basename.
 * @param documentSequence a document sequence.
 * @param termProcessor the term processor for this index.
 * @param zipCollectionBasename if not <code>null</code>, the basename of a new GZIP'd collection built
 * using <code>documentSequence</code>.
 * @param bufferSize the buffer size used in all I/O.
 * @param documentsPerBatch the number of documents that we should try to put in each segment.
 * @param indexedField the fields that should be indexed, in increasing order.
 * @param virtualDocumentResolver the array of virtual document resolvers to be used, parallel to
 * <code>indexedField</code>: it can safely contain anything (even <code>null</code>) for non-virtual
 * fields, and can safely be <code>null</code> if no fields are virtual.
 * @param virtualGap the array of virtual field gaps to be used, parallel to <code>indexedField</code>:
 * it can safely contain anything for non-virtual fields, and can safely be <code>null</code> if no
 * fields are virtual.
 * @param mapFile the name of a file containing a map to be applied to document indices.
 * @param logInterval the minimum time interval between activity logs in milliseconds.
 * @param tempDirName a directory for temporary files.
 * @throws IOException
 * @throws ConfigurationException
 */
@SuppressWarnings("unchecked")
public static void run( final String basename, final DocumentSequence documentSequence, final TermProcessor termProcessor,
        final String zipCollectionBasename, final int bufferSize, final int documentsPerBatch, final int[] indexedField,
        final VirtualDocumentResolver[] virtualDocumentResolver, final int[] virtualGap, final String mapFile,
        final long logInterval, final String tempDirName ) throws ConfigurationException, IOException {
    int numDocuments = 0;
    final int numberOfIndexedFields = indexedField.length;
    if ( numberOfIndexedFields == 0 ) throw new IllegalArgumentException( "You must specify at least one field" );

    final DocumentFactory factory = documentSequence.factory();
    final File tempDir = tempDirName == null ? null : new File( tempDirName );

    for ( int i = 0; i < indexedField.length; i++ )
        if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.VIRTUAL
                && ( virtualDocumentResolver == null || virtualDocumentResolver[ i ] == null ) )
            throw new IllegalArgumentException( "No resolver was associated with virtual field " + factory.fieldName( indexedField[ i ] ) );

    final int[] map = mapFile != null ? BinIO.loadInts( mapFile ) : null;

    final Scan[] scan = new Scan[ numberOfIndexedFields ]; // To scan textual content
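// A minimal usage sketch, not part of scan.java: one possible way to invoke the first run(...)
// overload documented above from a separate caller class, assuming the usual MG4J imports are on
// the classpath and that a DocumentSequence and a TermProcessor are obtained elsewhere. The
// basename "batchIndex", the 64 KiB buffer, the 10000-document batch size, the single indexed
// field { 0 } and the 10-second log interval are illustrative assumptions, not MG4J defaults.
static void buildIndexSketch( final DocumentSequence sequence, final TermProcessor processor )
        throws ConfigurationException, IOException {
    Scan.run(
            "batchIndex",      // index basename; batch files are derived from it
            sequence,          // the document sequence to be scanned
            processor,         // term processor applied to every indexed token
            null,              // do not build a zipped collection
            64 * 1024,         // I/O buffer size in bytes
            10000,             // documents per batch
            new int[] { 0 },   // fields to index, in increasing order
            null,              // no renumbering file
            10000,             // minimum interval between activity logs, in milliseconds
            null );            // use the default temporary directory
}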