📄 combine.java
字号:
final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, final boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { this.logInterval = logInterval; LOGGER.debug( "Combining indices " + Arrays.toString( inputBasename ) + " into " + outputBasename ); this.inputBasename = inputBasename; this.outputBasename = outputBasename; this.metadataOnly = metadataOnly; this.bufferSize = bufferSize; numIndices = inputBasename.length; index = new BitStreamIndex[ numIndices ]; indexReader = new IndexReader[ numIndices ]; indexIterator = new IndexIterator[ numIndices ]; globCounts = new InputBitStream[ numIndices ]; term = new MutableString[ numIndices ]; termReader = new FastBufferedReader[ numIndices ]; termQueue = new ObjectHeapSemiIndirectPriorityQueue<MutableString>( term, numIndices ); // This will remain set if *all* indices to be merged agree boolean haveCounts = true, havePositions = true; /* This will be set if *all* indices to be merged agree. Moreover, if some * indices disagree we will emit a warning. */ TermProcessor termProcessor = null; /* This will be set if *all* indices to be merged agree. Moreover, if some * indices disagree we will emit a warning. */ Payload payload = null; String field = null; writeGlobCounts = writeSizes = true; boolean someGlobCounts = false, someSizes = false; for( int i = 0; i < numIndices; i++ ) { index[ i ] = getIndex( inputBasename[ i ] ); if ( i == 0 ) { termProcessor = index[ 0 ].termProcessor.copy(); payload = index[ 0 ].payload == null ? null : index[ 0 ].payload.copy(); } else { if ( ! termProcessor.equals( index[ i ].termProcessor ) ) throw new IllegalStateException( "The term processor of the first index (" + termProcessor + ") is different from the term processor of index " + i + " (" + index[ i ].termProcessor + ")" ); if ( ( payload == null ) != ( index[ i ].payload == null ) || payload != null && ! payload.compatibleWith( index[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + index[ 0 ] + " is not compatible with that of index " + index[ i ] ); } if ( index[ i ].field != null ) { if ( field == null ) { if ( i != 0 ) LOGGER.warn( "Not all indices specify the field property" ); field = index[ i ].field; } else if ( ! field.equals( index[ i ].field ) ) LOGGER.warn( "Index fields disagree: \"" + field + "\", \"" + index[ i ].field + "\"" ); } haveCounts &= index[ i ].hasCounts; havePositions &= index[ i ].hasPositions; maxCount = Math.max( maxCount, index[ i ].maxCount ); if ( ! metadataOnly ) indexReader[ i ] = index[ i ].getReader( bufferSize ); if ( index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES ) == -1 ) numberOfOccurrences = -1; if ( numberOfOccurrences != -1 ) numberOfOccurrences += index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES ); final File globCountsFile = new File( inputBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION ); writeGlobCounts &= globCountsFile.exists(); someGlobCounts |= globCountsFile.exists(); if ( writeGlobCounts ) globCounts[ i ] = new InputBitStream( globCountsFile ); final File sizesFile = new File( inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION ); writeSizes &= sizesFile.exists(); someSizes |= sizesFile.exists(); term[ i ] = new MutableString(); termReader[ i ] = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ); if ( termReader[ i ].readLine( term[ i ] ) != null ) termQueue.enqueue( i ); // If the term list is nonempty, we enqueue it } if ( writeGlobCounts != someGlobCounts ) LOGGER.warn( "Some (but not all) global-counts file missing" ); if ( writeSizes != someSizes ) LOGGER.warn( "Some (but not all) sizes file missing" ); additionalProperties = new Properties(); additionalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) ); if ( payload != null ) { additionalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() ); //writerFlags.put( Component.PAYLOADS, null ); } additionalProperties.setProperty( Index.PropertyKeys.BATCHES, inputBasename.length ); if ( field != null ) additionalProperties.setProperty( Index.PropertyKeys.FIELD, field ); usedIndex = new int[ numIndices ]; frequency = new int[ numIndices ]; position = new int[ maxCount ]; numberOfDocuments = combineNumberOfDocuments(); if ( ( hasCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! haveCounts ) throw new IllegalArgumentException( "Some of the indices to be combined do not have counts." ); if ( ( hasPositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions." ); if ( ( hasPayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && payload == null ) throw new IllegalArgumentException( "Indices to be combined do not have payloads." ); interleaved |= ! hasPositions || hasPayloads; if ( ! metadataOnly ) { if ( ! interleaved ) indexWriter = new BitStreamHPIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height ); else if ( ! skips ) indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags ); else indexWriter = new SkipBitStreamIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height ); //else indexWriter = new SqrtSkipIndexWriter( outputBasename, numberOfDocuments, true, writerFlags ); } } /** Return a index with given basename, loaded with options suitable to perform the combination. * * <p>This basic implementation calls {@link it.unimi.dsi.mg4j.index.Index#getInstance(CharSequence, boolean, boolean)} * with all Boolean parameters set to false. Subclasses can override this * method to load more data. * * @param basename an index basename. * @return an index loaded with the correct options for the combining strategy. */ protected BitStreamIndex getIndex( final CharSequence basename ) throws ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { return (BitStreamIndex)Index.getInstance( basename, false, false, false ); } /** Combines the number of documents. * * @return the number of documents of the combined index. */ protected abstract int combineNumberOfDocuments(); /** A partial {@link IntIterator} implementation based on γ-coded integers. * * <p>Instances of this class adapt an {@link InputBitStream} to an {@link IntIterator} * by reading γ-coded integers. The implementation is partial because {@link #hasNext()} * always returns true—the user must know in advance how many times {@link #nextInt()} * may be safely called. * * @see #sizes(int) */ protected static final class GammaCodedIntIterator extends AbstractIntIterator implements Closeable { final private InputBitStream inputBitStream; public GammaCodedIntIterator( final InputBitStream inputBitStream ) { this.inputBitStream = inputBitStream; } /** Returns true. * @return true */ public boolean hasNext() { return true; } /** Returns the next γ-coded integer in the underlying {@link InputBitStream}. * @return the result of {@link InputBitStream#readGamma()}. */ public int nextInt() { try { return inputBitStream.readGamma(); } catch ( IOException e ) { throw new RuntimeException( e ); } } /** Delegates to the underlying {@link InputBitStream}. */ public void close() throws IOException { inputBitStream.close(); } } /** Returns an iterator on sizes. * * <p>The purpose of this method is to provide {@link #combineSizes()} implementations with * a way to access the size list from a disk file or from {@link BitStreamIndex#sizes} transparently. * This mechanism is essential to ensure that size files are read exactly once. * * <p>The caller should check whether the returned object implements {@link Closeable}, * and, in this case, invoke {@link Closeable#close()} after usage. * * @param numIndex the number of an index. * @return an iterator on the sizes of the index. */ protected IntIterator sizes( int numIndex ) throws FileNotFoundException { if ( index[ numIndex ].sizes != null ) return index[ numIndex ].sizes.listIterator(); LOGGER.debug( "Reading sizes from " + inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION ); return new GammaCodedIntIterator( new InputBitStream( inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION ) ); } /** Combines size lists. * * @return the maximum size of a document in the combined index. * @throws IOException */ protected abstract int combineSizes() throws IOException; /** Combines several indices. * * <p>When this method is called, exactly <code>numUsedIndices</code> entries * of {@link #usedIndex} contain, in increasing order, the indices containing * inverted lists for the current term. Implementations of this method must * combine the inverted list, save the total global count for the current * term and return the resulting frequency. * * @param numUsedIndices the number of valid entries in {@link #usedIndex}. * @return the frequency of the combined lists. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -