📄 combine.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
			final int bufferSize,			final Map<Component,Coding> writerFlags,			boolean interleaved,			final boolean skips,			final int quantum,			final int height,			final int skipBufferSize,			final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {		this.logInterval = logInterval;		LOGGER.debug( "Combining indices " + Arrays.toString( inputBasename ) + " into " + outputBasename );				this.inputBasename = inputBasename;		this.outputBasename = outputBasename;		this.metadataOnly = metadataOnly;		this.bufferSize = bufferSize;		numIndices = inputBasename.length;		index = new BitStreamIndex[ numIndices ];		indexReader = new IndexReader[ numIndices ];		indexIterator = new IndexIterator[ numIndices ];		globCounts = new InputBitStream[ numIndices ];		term = new MutableString[ numIndices ];		termReader = new FastBufferedReader[ numIndices ];		termQueue = new ObjectHeapSemiIndirectPriorityQueue<MutableString>( term, numIndices );				// This will remain set if *all* indices to be merged agree		boolean haveCounts = true, havePositions = true;		/* This will be set if *all* indices to be merged agree. Moreover, if some		 * indices disagree we will emit a warning. */		TermProcessor termProcessor = null;		/* This will be set if *all* indices to be merged agree. Moreover, if some		 * indices disagree we will emit a warning. */		Payload payload = null;		String field = null;		writeGlobCounts = writeSizes = true;		boolean someGlobCounts = false, someSizes = false;				for( int i = 0; i < numIndices; i++ ) {			index[ i ] = getIndex( inputBasename[ i ] );			if ( i == 0 ) {				termProcessor = index[ 0 ].termProcessor.copy();				payload = index[ 0 ].payload == null ? null : index[ 0 ].payload.copy();			}			else {				if ( ! 
termProcessor.equals( index[ i ].termProcessor ) ) throw new IllegalStateException( "The term processor of the first index (" + termProcessor + ") is different from the term processor of index " + i + " (" + index[ i ].termProcessor + ")" );				if ( ( payload == null ) != ( index[ i ].payload == null ) || payload != null && ! payload.compatibleWith( index[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + index[ 0 ] + " is not compatible with that of index " + index[ i ] );			}			if ( index[ i ].field != null ) {				if ( field == null ) {					if ( i != 0 ) LOGGER.warn( "Not all indices specify the field property" );					field = index[ i ].field;				}				else if ( ! field.equals( index[ i ].field ) ) LOGGER.warn( "Index fields disagree: \"" + field + "\", \"" + index[ i ].field + "\"" );			}			haveCounts &= index[ i ].hasCounts;			havePositions &= index[ i ].hasPositions;			maxCount = Math.max( maxCount, index[ i ].maxCount );			if ( ! metadataOnly ) indexReader[ i ] = index[ i ].getReader( bufferSize );			if ( index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES ) == -1 ) numberOfOccurrences = -1;			if ( numberOfOccurrences != -1 ) numberOfOccurrences += index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES );			final File globCountsFile = new File( inputBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );			writeGlobCounts &= globCountsFile.exists();			someGlobCounts |= globCountsFile.exists();			if ( writeGlobCounts ) globCounts[ i ] = new InputBitStream( globCountsFile );			final File sizesFile = new File( inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );			writeSizes &= sizesFile.exists();			someSizes |= sizesFile.exists();			term[ i ] = new MutableString();			termReader[ i ] = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );			if ( termReader[ i ].readLine( term[ i ] ) != null ) termQueue.enqueue( i ); // If the 
term list is nonempty, we enqueue it		}		if ( writeGlobCounts != someGlobCounts ) LOGGER.warn(  "Some (but not all) global-counts file missing" );		if ( writeSizes != someSizes ) LOGGER.warn(  "Some (but not all) sizes file missing" );				additionalProperties = new Properties();		additionalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );		if ( payload != null ) {			additionalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );			//writerFlags.put( Component.PAYLOADS, null );		}		additionalProperties.setProperty( Index.PropertyKeys.BATCHES, inputBasename.length );		if ( field != null ) additionalProperties.setProperty( Index.PropertyKeys.FIELD, field );		usedIndex = new int[ numIndices ];		frequency = new int[ numIndices ];		position = new int[ maxCount ];		numberOfDocuments = combineNumberOfDocuments();				if ( ( hasCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! haveCounts ) throw new IllegalArgumentException( "Some of the indices to be combined do not have counts." );		if ( ( hasPositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions." );		if ( ( hasPayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && payload == null ) throw new IllegalArgumentException( "Indices to be combined do not have payloads." );		interleaved |= ! hasPositions || hasPayloads;		if ( ! metadataOnly ) {			if ( ! interleaved ) indexWriter = new BitStreamHPIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );			else if ( ! 
skips ) indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );			else indexWriter = new SkipBitStreamIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );			//else indexWriter = new SqrtSkipIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );		}	}		/** Return a index with given basename, loaded with options suitable to perform the combination.	 * 	 * <p>This basic implementation calls {@link it.unimi.dsi.mg4j.index.Index#getInstance(CharSequence, boolean, boolean)}	 * with all Boolean parameters set to false. Subclasses can override this	 * method to load more data.	 * 	 * @param basename an index basename.	 * @return an index loaded with the correct options for the combining strategy.	 */	protected BitStreamIndex getIndex( final CharSequence basename ) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {		return (BitStreamIndex)Index.getInstance( basename, false, false, false );	}			/** Combines the number of documents.	 * 	 * @return the number of documents of the combined index.	 */	protected abstract int combineNumberOfDocuments();		/** A partial {@link IntIterator} implementation based on &gamma;-coded integers.	 * 	 * <p>Instances of this class adapt an {@link InputBitStream} to an {@link IntIterator}	 * by reading &gamma;-coded integers. The implementation is partial because {@link #hasNext()}	 * always returns true&mdash;the user must know in advance how many times {@link #nextInt()}	 * may be safely called. 	 
* 	 * @see #sizes(int)	 */	protected static final class GammaCodedIntIterator extends AbstractIntIterator implements Closeable {		final private InputBitStream inputBitStream;		public GammaCodedIntIterator( final InputBitStream inputBitStream ) {			this.inputBitStream = inputBitStream;		}		/** Returns true.		 * @return true		 */		public boolean hasNext() { return true; }				/** Returns the next &gamma;-coded integer in the underlying {@link InputBitStream}. 		 * @return the result of {@link InputBitStream#readGamma()}.		 */		public int nextInt() { 			try {				return inputBitStream.readGamma();			}			catch ( IOException e ) {				throw new RuntimeException( e );			} 		}				/** Delegates to the underlying {@link InputBitStream}. */		public void close() throws IOException {			inputBitStream.close();		}	}		/** Returns an iterator on sizes.	 * 	 * <p>The purpose of this method is to provide {@link #combineSizes()} implementations with	 * a way to access the size list from a disk file or from {@link BitStreamIndex#sizes} transparently.	 * This mechanism is essential to ensure that size files are read exactly once.	 * 	 * <p>The caller should check whether the returned object implements {@link Closeable},	 * and, in this case, invoke {@link Closeable#close()} after usage.	 *	 * @param numIndex the number of an index.	 * @return an iterator on the sizes of the index.	 */		protected IntIterator sizes( int numIndex ) throws FileNotFoundException {		if ( index[ numIndex ].sizes != null ) return index[ numIndex ].sizes.listIterator();		LOGGER.debug( "Reading sizes from " + inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION );		return new GammaCodedIntIterator( new InputBitStream( inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION ) );	}		/** Combines size lists.	 * 	 * @return the maximum size of a document in the combined index.	 * @throws IOException	 */	protected abstract int combineSizes() throws IOException;		/** Combines several indices.	 
*
	 * <p>When this method is called, exactly <code>numUsedIndices</code> entries
	 * of {@link #usedIndex} contain, in increasing order, the indices containing
	 * inverted lists for the current term. Implementations of this method must
	 * combine the inverted lists, save the total global count for the current
	 * term and return the resulting frequency.
	 * 
	 * @param numUsedIndices the number of valid entries in {@link #usedIndex}.
	 * @return the frequency of the combined lists.
	 */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -