scan.java
        final OutputBitStream frequencies = new OutputBitStream( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
        final OutputBitStream globCounts = new OutputBitStream( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );

        if ( indexingIsStandard ) {
            final OutputBitStream index = new OutputBitStream( batchBasename + DiskBasedIndex.INDEX_EXTENSION );
            final OutputBitStream offsets = new OutputBitStream( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION );

            ByteArrayPostingList baobs;
            int maxCount = 0, frequency;
            long bitLength, postings = 0, prevOffset = 0;
            offsets.writeGamma( 0 );

            for ( int i = 0; i < numTerms; i++ ) {
                baobs = termMap.get( termArray[ i ] );
                frequency = baobs.frequency;
                baobs.flush();
                if ( maxCount < baobs.maxCount ) maxCount = baobs.maxCount;
                bitLength = baobs.writtenBits();
                baobs.align();
                postings += frequency;

                index.writeGamma( frequency - 1 );
                // We need special treatment for terms appearing in all documents
                if ( frequency == documentCount ) baobs.stripPointers( index, bitLength );
                else index.write( baobs.buffer, bitLength );

                frequencies.writeGamma( frequency );
                globCounts.writeLongGamma( baobs.globCount );
                offsets.writeLongGamma( index.writtenBits() - prevOffset );
                prevOffset = index.writtenBits();
            }

            totPostings += postings;

            final Properties properties = new Properties();
            properties.setProperty( Index.PropertyKeys.DOCUMENTS, documentCount );
            properties.setProperty( Index.PropertyKeys.TERMS, numTerms );
            properties.setProperty( Index.PropertyKeys.POSTINGS, postings );
            properties.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
            properties.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
            properties.addProperty( Index.PropertyKeys.CODING, "FREQUENCIES:GAMMA" );
            properties.addProperty( Index.PropertyKeys.CODING, "POINTERS:DELTA" );
            properties.addProperty( Index.PropertyKeys.CODING, "COUNTS:GAMMA" );
            properties.addProperty( Index.PropertyKeys.CODING, "POSITIONS:DELTA" );
            properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
            properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
            properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
            properties.setProperty( Index.PropertyKeys.SIZE, index.writtenBits() );
            if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
            properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

            index.close();
            offsets.close();
        }
        else {
            final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, maxDocInBatch + 1, true, flags );
            ByteArrayPostingList baobs;
            OutputBitStream obs;
            int maxCount = 0, maxFrequency = 0, frequency, count;

            // Compute max frequency and allocate position array.
            for ( ByteArrayPostingList b : termMap.values() ) {
                b.flush();
                b.align();
                if ( maxFrequency < b.frequency ) maxFrequency = b.frequency;
                if ( maxCount < b.maxCount ) maxCount = b.maxCount;
            }

            final long[] bitPos = new long[ maxFrequency ];
            final int[] pointer = new int[ maxFrequency ];
            int[] pos = new int[ maxCount ];

            for ( int i = 0; i < numTerms; i++ ) {
                baobs = termMap.get( termArray[ i ] );
                final InputBitStream ibs = new InputBitStream( baobs.buffer );
                frequency = baobs.frequency; // This could be much more than the actual frequency in virtual indices

                // Calculate posting bit positions and corresponding pointers
                for ( int j = 0; j < frequency; j++ ) {
                    bitPos[ j ] = ibs.readBits(); // Cache bit position
                    pointer[ j ] = ibs.readDelta(); // Cache pointer
                    for ( int p = ibs.readGamma() + 1; p-- != 0; ) ibs.readDelta(); // Skip document positions
                }

                // Stably sort pointers and positions by increasing pointer
                GenericSorting.quickSort( 0, frequency,
                    new IntComparator() {
                        public int compare( final int i0, final int i1 ) {
                            final int t = pointer[ i0 ] - pointer[ i1 ];
                            if ( t != 0 ) return t;
                            final long u = bitPos[ i0 ] - bitPos[ i1 ]; // We need a stable sort
                            return u < 0 ? -1 : u > 0 ? 1 : 0;
                        }
                    },
                    new Swapper() {
                        public void swap( final int i0, final int i1 ) {
                            final long t = bitPos[ i0 ]; bitPos[ i0 ] = bitPos[ i1 ]; bitPos[ i1 ] = t;
                            final int p = pointer[ i0 ]; pointer[ i0 ] = pointer[ i1 ]; pointer[ i1 ] = p;
                        }
                    } );

                int actualFrequency = frequency;
                // Compute actual frequency for virtual indices
                if ( indexingIsVirtual ) {
                    actualFrequency = 1;
                    for ( int j = 1; j < frequency; j++ ) if ( pointer[ j ] != pointer[ j - 1 ] ) actualFrequency++;
                    if ( ASSERTS ) {
                        for ( int j = 1; j < frequency; j++ ) {
                            assert pointer[ j ] >= pointer[ j - 1 ];
                            assert pointer[ j ] != pointer[ j - 1 ] || bitPos[ j ] > bitPos[ j - 1 ];
                        }
                    }
                }

                indexWriter.newInvertedList();
                indexWriter.writeFrequency( actualFrequency );

                int currPointer;
                for ( int j = 0; j < frequency; j++ ) {
                    ibs.position( bitPos[ j ] );
                    obs = indexWriter.newDocumentRecord();
                    indexWriter.writeDocumentPointer( obs, currPointer = ibs.readDelta() );
                    if ( ASSERTS ) assert currPointer == pointer[ j ];
                    count = ibs.readGamma() + 1;
                    pos[ 0 ] = ibs.readDelta();
                    for ( int p = 1; p < count; p++ ) pos[ p ] = pos[ p - 1 ] + 1 + ibs.readDelta();

                    if ( indexingIsVirtual ) {
                        while ( j < frequency - 1 ) {
                            ibs.position( bitPos[ j + 1 ] );
                            if ( currPointer != ibs.readDelta() ) break;
                            j++;
                            final int moreCount = ibs.readGamma() + 1;
                            pos = IntArrays.grow( pos, count + moreCount, count );
                            pos[ count ] = ibs.readDelta();
                            if ( ASSERTS ) assert pos[ count ] > pos[ count - 1 ];
                            for ( int p = 1; p < moreCount; p++ ) pos[ count + p ] = pos[ count + p - 1 ] + 1 + ibs.readDelta();
                            count += moreCount;
                        }
                        if ( maxCount < count ) maxCount = count;
                    }

                    indexWriter.writePositionCount( obs, count );
                    indexWriter.writeDocumentPositions( obs, pos, 0, count, -1 );
                }

                frequencies.writeGamma( actualFrequency );
                globCounts.writeLongGamma( baobs.globCount );
            }

            indexWriter.close();

            final Properties properties = indexWriter.properties();
            totPostings += properties.getLong( "postings" );
            properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
            properties.setProperty( Index.PropertyKeys.OCCURRENCES, numOccurrences );
            properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize );
            properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
            if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
            properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

            if ( indexingIsRemapped ) {
                // We must permute sizes
                final int[] document = new int[ documentCount ], size = new int[ documentCount ];
                final InputBitStream sizes = new InputBitStream( batchBasename + DiskBasedIndex.SIZES_EXTENSION );
                for ( int i = 0; i < documentCount; i++ ) {
                    document[ i ] = sizes.readGamma();
                    size[ i ] = sizes.readGamma();
                }
                GenericSorting.quickSort( 0, documentCount,
                    new IntComparator() {
                        public int compare( int x, int y ) { return document[ x ] - document[ y ]; }
                    },
                    new Swapper() {
                        public void swap( int x, int y ) {
                            int t = document[ x ]; document[ x ] = document[ y ]; document[ y ] = t;
                            t = size[ x ]; size[ x ] = size[ y ]; size[ y ] = t;
                        }
                    } );
                final OutputBitStream permutedSizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
                for ( int i = 0, d = 0; i < documentCount; i++ ) {
                    while ( d++ < document[ i ] ) permutedSizes.writeGamma( 0 );
                    permutedSizes.writeGamma( size[ i ] );
                }
                permutedSizes.close();
            }
        }

        if ( indexingIsVirtual ) {
            final OutputBitStream sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
            for ( int i = 0; i < currMaxPos.length; i++ ) sizes.writeGamma( currMaxPos[ i ] );
            sizes.close();
            IntArrays.fill( currMaxPos, 0 );
        }

        globCounts.close();
        frequencies.close();
        termMap.clear();

        numTerms = 0;
        totOccurrences += numOccurrences;
        totDocuments += documentCount;
        final long result = numOccurrences;
        numOccurrences = 0;
        globMaxDocSize = Math.max( maxDocSize, globMaxDocSize );
        // The cut point must be recorded before the document count is reset.
        if ( indexingIsStandard ) cutPoints.add( cutPoints.getInt( cutPoints.size() - 1 ) + documentCount );
        maxDocSize = documentCount = 0;
        maxDocInBatch = -1;
        batch++;

        System.gc(); // This is exactly the right time to do collection and compaction.
        return result;
    }
    catch ( IOException e ) {
        LOGGER.fatal( "I/O Error on batch " + batch );
        throw e;
    }
}

protected void openSizeBitStream() throws FileNotFoundException {
    if ( ! indexingIsVirtual ) sizes = new OutputBitStream( batchBasename( batch, basename, batchDir ) + DiskBasedIndex.SIZES_EXTENSION );
}

/** Runs a number of instances in parallel. */
public static void run( final String basename, final DocumentSequence documentSequence, final TermProcessor termProcessor,
        final String zipCollectionBasename, final int bufferSize, final int documentsPerBatch, final int[] indexedField,
        final String renumberingFile, final long logInterval, final String tempDirName ) throws ConfigurationException, IOException {
    run( basename, documentSequence, termProcessor, zipCollectionBasename, bufferSize, documentsPerBatch, indexedField,
            null, null, renumberingFile, logInterval, tempDirName );
}

/**
 * Runs a number of instances in parallel.
 *
 * <p>This convenience method takes care of instantiating one instance per indexed field and of passing
 * the right information to each instance. All options are common to all fields, except for the number of
 * occurrences in a batch, which can be tuned for each field separately.
 *
 * @param basename the index basename.
 * @param documentSequence a document sequence.
 * @param termProcessor the term processor for this index.
 * @param zipCollectionBasename if not <code>null</code>, the basename of a new GZIP'd collection built
 * using <code>documentSequence</code>.
 * @param bufferSize the buffer size used in all I/O.
 * @param documentsPerBatch the number of documents that we should try to put in each segment.
 * @param indexedField the fields that should be indexed, in increasing order.
 * @param virtualDocumentResolver the array of virtual document resolvers to be used, parallel to
 * <code>indexedField</code>: it can safely contain anything (even <code>null</code>) for non-virtual
 * fields, and can safely be <code>null</code> if no fields are virtual.
 * @param virtualGap the array of virtual field gaps to be used, parallel to <code>indexedField</code>:
 * it can safely contain anything for non-virtual fields, and can safely be <code>null</code> if no
 * fields are virtual.
 * @param mapFile the name of a file containing a map to be applied to document indices.
 * @param logInterval the minimum time interval between activity logs in milliseconds.
 * @param tempDirName a directory for temporary files.
 * @throws IOException
 * @throws ConfigurationException
 */
@SuppressWarnings("unchecked")
public static void run( final String basename, final DocumentSequence documentSequence, final TermProcessor termProcessor,
        final String zipCollectionBasename, final int bufferSize, final int documentsPerBatch, final int[] indexedField,
        final VirtualDocumentResolver[] virtualDocumentResolver, final int[] virtualGap, final String mapFile,
        final long logInterval, final String tempDirName ) throws ConfigurationException, IOException {
    int numDocuments = 0;
    final int numberOfIndexedFields = indexedField.length;
    if ( numberOfIndexedFields == 0 ) throw new IllegalArgumentException( "You must specify at least one field" );

    final DocumentFactory factory = documentSequence.factory();
    final File tempDir = tempDirName == null ? null : new File( tempDirName );

    for ( int i = 0; i < indexedField.length; i++ )
        if ( factory.fieldType( indexedField[ i ] ) == DocumentFactory.FieldType.VIRTUAL
                && ( virtualDocumentResolver == null || virtualDocumentResolver[ i ] == null ) )
            throw new IllegalArgumentException( "No resolver was associated with virtual field " + factory.fieldName( indexedField[ i ] ) );

    final int[] map = mapFile != null ? BinIO.loadInts( mapFile ) : null;

    final Scan[] scan = new Scan[ numberOfIndexedFields ]; // To scan textual content
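// A minimal usage sketch, not part of scan.java: one possible way to invoke the first run(...)
// overload documented above from a separate caller class, assuming the usual MG4J imports are on
// the classpath and that a DocumentSequence and a TermProcessor are obtained elsewhere. The
// basename "batchIndex", the 64 KiB buffer, the 10000-document batch size, the single indexed
// field { 0 } and the 10-second log interval are illustrative assumptions, not MG4J defaults.
static void buildIndexSketch( final DocumentSequence sequence, final TermProcessor processor )
        throws ConfigurationException, IOException {
    Scan.run(
            "batchIndex",      // index basename; batch files are derived from it
            sequence,          // the document sequence to be scanned
            processor,         // term processor applied to every indexed token
            null,              // do not build a zipped collection
            64 * 1024,         // I/O buffer size in bytes
            10000,             // documents per batch
            new int[] { 0 },   // fields to index, in increasing order
            null,              // no renumbering file
            10000,             // minimum interval between activity logs, in milliseconds
            null );            // use the default temporary directory
}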