📄 scan.java
public String toString() {
	return this.getClass().getSimpleName() + "(" + basename + ":" + field + ")";
}

/** An accumulator for payloads.
 *
 * <P>This class is essentially a stripped-down version of {@link Scan} that just accumulates
 * payloads in a bitstream and releases them in batches. The main difference is that neither
 * sizes nor globcounts are saved (as they would not make much sense).
 */
protected static class PayloadAccumulator {
	/** The current basename of the overall index (usually some basename postfixed with the field name). */
	private final String basename;
	/** The field name, if available. */
	private final String field;
	/** The total number of postings (pairs term/document). */
	private long totPostings;
	/** The directory where batch files will be created. */
	private final File batchDir;
	/** The flag map for batches. */
	final Map<Component, Coding> flags;
	/** The total number of documents. */
	private int totDocuments;
	/** The number of documents indexed so far in the current batch. */
	private int documentCount;
	/** The current batch. */
	private int batch;
	/** The type of indexing for this scan. */
	private final IndexingType indexingType;
	/** The pointers into the stream, if {@link #indexingType} is {@link IndexingType#REMAPPED}. */
	private long position[];
	/** The output stream underlying this accumulator. */
	private FastByteArrayOutputStream accumulatorStream;
	/** The accumulating output bit stream, wrapping {@link #accumulatorStream}. */
	private OutputBitStream accumulator;
	/** The cutpoints of the batches (for later building a {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}). */
	protected final IntArrayList cutPoints;
	/** The payload accumulated by this accumulator. */
	private final Payload payload;
	/** The maximum document ever seen in the current batch. */
	private int maxDocInBatch;

	/** Creates a new accumulator.
	 *
	 * @param basename the basename (usually a global filename followed by the field name, separated by a dash).
	 * @param payload the payload stored by this accumulator.
	 * @param field the name of the accumulated field.
	 * @param indexingType the type of indexing procedure.
	 * @param documentsPerBatch the number of documents in each batch.
	 * @param batchDir a directory for batch files; batch names will be relativised to this directory if it is not <code>null</code>.
	 */
	public PayloadAccumulator( final String basename, final Payload payload, final String field, final IndexingType indexingType, final int documentsPerBatch, final File batchDir ) {
		this.basename = basename;
		this.payload = payload;
		this.field = field;
		this.indexingType = indexingType;
		if ( indexingType != IndexingType.STANDARD && indexingType != IndexingType.REMAPPED )
			throw new UnsupportedOperationException( "Non-standard payload-based indices support only standard and remapped indexing" );
		if ( indexingType == IndexingType.REMAPPED ) position = new long[ documentsPerBatch ];
		this.batchDir = batchDir;
		this.cutPoints = new IntArrayList();
		this.cutPoints.add( 0 );
		flags = new EnumMap<Component, Coding>( CompressionFlags.DEFAULT_PAYLOAD_INDEX );
		accumulatorStream = new FastByteArrayOutputStream();
		accumulator = new OutputBitStream( accumulatorStream );
	}

	/** Writes in compressed form the data currently accumulated.
	 */
	protected void writeData() throws IOException, ConfigurationException {
		final String batchBasename = batchBasename( batch, basename, batchDir );

		LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount );

		try {
			accumulator.flush();
			final InputBitStream ibs = new InputBitStream( accumulatorStream.array );
			final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, indexingType == IndexingType.STANDARD ? documentCount : maxDocInBatch + 1, false, flags );
			indexWriter.newInvertedList();
			indexWriter.writeFrequency( documentCount );
			OutputBitStream obs;

			if ( indexingType == IndexingType.STANDARD ) {
				for ( int i = 0; i < documentCount; i++ ) {
					obs = indexWriter.newDocumentRecord();
					indexWriter.writeDocumentPointer( obs, i );
					payload.read( ibs );
					indexWriter.writePayload( obs, payload );
				}
			}
			else {
				// We sort position by pointed document pointer.
				Sorting.quickSort( position, 0, documentCount, new LongComparator() {
					public int compare( final long position0, final long position1 ) {
						try {
							ibs.position( position0 );
							final int d0 = ibs.readDelta();
							ibs.position( position1 );
							return d0 - ibs.readDelta();
						}
						catch ( IOException e ) {
							throw new RuntimeException( e );
						}
					}
				} );

				for ( int i = 0; i < documentCount; i++ ) {
					obs = indexWriter.newDocumentRecord();
					ibs.position( position[ i ] );
					indexWriter.writeDocumentPointer( obs, ibs.readDelta() );
					payload.read( ibs );
					indexWriter.writePayload( obs, payload );
				}

				maxDocInBatch = 0;
			}

			indexWriter.close();

			final Properties properties = indexWriter.properties();
			totPostings += properties.getLong( "postings" );
			properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
			properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
			properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
			properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
			properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
			if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
			properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

			// We *must* generate a fake term file, or index combination won't work.
			final PrintWriter termWriter = new PrintWriter( new FileWriter( batchBasename + DiskBasedIndex.TERMS_EXTENSION ) );
			termWriter.println( "#" );
			termWriter.close();

			cutPoints.add( cutPoints.getInt( cutPoints.size() - 1 ) + documentCount );
			accumulatorStream.reset();
			accumulator.writtenBits( 0 );
			documentCount = 0;
			maxDocInBatch = 0;
			batch++;
		}
		catch ( IOException e ) {
			LOGGER.fatal( "I/O Error on batch " + batch );
			throw e;
		}
	}

	/** Processes the payload of a given document.
	 *
	 * @param documentPointer the document pointer.
	 * @param content the payload.
	 */
	public void processData( final int documentPointer, final Object content ) throws IOException {
		// We write document pointers only for non-standard indices.
		if ( indexingType != IndexingType.STANDARD ) {
			position[ documentCount ] = accumulator.writtenBits();
			accumulator.writeDelta( documentPointer );
		}
		// TODO: devise an out-of-memory-error check mechanism similar to that of ByteArrayPostingList.
		payload.set( content );
		payload.write( accumulator );
		if ( documentPointer > maxDocInBatch ) maxDocInBatch = documentPointer;
		documentCount++;
		totDocuments++;
	}

	/** Closes this accumulator, releasing all resources. */
	public void close() throws ConfigurationException, IOException {
		if ( documentCount > 0 ) writeData();

		if ( totDocuments == 0 ) {
			// Special case: no document has been indexed. We generate an empty batch.
			final String batchBasename = batchBasename( 0, basename, batchDir );
			LOGGER.debug( "Generating empty index " + batchBasename );

			final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, 0, false, flags );
			indexWriter.close();

			final Properties properties = indexWriter.properties();
			properties.setProperty( Index.PropertyKeys.SIZE, 0 );
			properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
			properties.setProperty( Index.PropertyKeys.MAXCOUNT, -1 );
			properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
			properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
			properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
			if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
			properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

			new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ).close();
			batch = 1;
		}

		accumulator = null;
		accumulatorStream = null;
		position = null;

		final Properties properties = new Properties();
		if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
		properties.setProperty( Index.PropertyKeys.BATCHES, batch );
		properties.setProperty( Index.PropertyKeys.DOCUMENTS, totDocuments );
		properties.setProperty( Index.PropertyKeys.POSTINGS, totPostings );
		properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
		properties.setProperty( Index.PropertyKeys.MAXCOUNT, -1 );
		properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
		properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
		properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );

		// This set of batches can be seen as a documental cluster index.
		final Properties clusterProperties = new Properties();
		clusterProperties.addAll( properties );
		clusterProperties.setProperty( Index.PropertyKeys.TERMS, 1 );
		clusterProperties.setProperty( IndexCluster.PropertyKeys.BLOOM, false );
		clusterProperties.setProperty( IndexCluster.PropertyKeys.FLAT, true );

		if ( indexingType == IndexingType.STANDARD ) {
			clusterProperties.setProperty( Index.PropertyKeys.INDEXCLASS, DocumentalConcatenatedCluster.class.getName() );
			BinIO.storeObject( new ContiguousDocumentalStrategy( cutPoints.toIntArray() ), basename + CLUSTER_STRATEGY_EXTENSION );
		}
		else {
			clusterProperties.setProperty( Index.PropertyKeys.INDEXCLASS, DocumentalMergedCluster.class.getName() );
			BinIO.storeObject( new IdentityDocumentalStrategy( batch, totDocuments ), basename + CLUSTER_STRATEGY_EXTENSION );
		}
		clusterProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, basename + CLUSTER_STRATEGY_EXTENSION );
		for ( int i = 0; i < batch; i++ ) clusterProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, batchBasename( i, basename, batchDir ) );
		clusterProperties.save( basename + CLUSTER_PROPERTIES_EXTENSION );

		properties.save( basename + DiskBasedIndex.PROPERTIES_EXTENSION );
	}
}

public static int[] parseQualifiedSizes( final String[] qualifiedSizes, final String defaultSize, final int[] indexedField, final DocumentFactory factory ) throws ParseException {
	final int[] size = new int[ indexedField.length ];
	String defaultSpec = defaultSize;
	IntArrayList indexedFields = IntArrayList.wrap( indexedField );
	for ( int i = 0; i < qualifiedSizes.length; i++ )
		if ( qualifiedSizes[ i ].indexOf( ':' ) == -1 ) defaultSpec = qualifiedSizes[ i ];
	for ( int i = 0; i < size.length; i++ ) size[ i ] = (int)LongSizeStringParser.parseSize( defaultSpec );
	for ( int i = 0; i < qualifiedSizes.length; i++ ) {
		final int split = qualifiedSizes[ i ].indexOf( ':' );
		if ( split >= 0 ) {
			final String fieldName = qualifiedSizes[ i ].substring( 0, split );
			final int field = factory.fieldIndex( fieldName );
			if ( field < 0 ) throw new IllegalArgumentException( "Field " + fieldName + " is not part of factory " + factory.getClass().getName() );
			if ( !indexedFields.contains( field ) ) throw new IllegalArgumentException( "Field " + factory.fieldName( field ) + " is not being indexed" );
			size[ indexedFields.indexOf( field ) ] = (int)LongSizeStringParser.parseSize( qualifiedSizes[ i ].substring( split + 1 ) );
		}
	}
	return size;
}

public static VirtualDocumentResolver[] parseVirtualDocumentResolver( final String[] virtualDocumentSpec, final int[] indexedField, final DocumentFactory factory ) {
	final VirtualDocumentResolver[] virtualDocumentResolver = new VirtualDocumentResolver[ indexedField.length ]
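
A minimal usage sketch (not part of the original source): PayloadAccumulator is driven with one processData() call per document, followed by a single close(), which flushes the last batch and writes the per-batch, cluster and global property files. Because the class is a protected nested class, the hypothetical driver below is assumed to live inside Scan (or a subclass in the same package); the basename "index-date", the field name "date" and the DatePayload payload type are illustrative choices only, not values taken from this file.

// Hypothetical driver; assumed to be a member of Scan (or a subclass), since
// PayloadAccumulator is a protected nested class. All names below are illustrative.
static void accumulateDatesSketch( final java.util.Date[] lastModified ) throws java.io.IOException, org.apache.commons.configuration.ConfigurationException {
	final PayloadAccumulator accumulator = new PayloadAccumulator(
		"index-date",                                      // illustrative basename
		new it.unimi.dsi.mg4j.index.payload.DatePayload(), // the payload stored by this accumulator
		"date",                                            // illustrative field name
		IndexingType.STANDARD,                             // document pointers are implicit ( 0, 1, 2, ... )
		10000,                                             // documents per batch
		null                                               // batch files are created next to the basename
	);
	// One call per document; for STANDARD indexing the pointer is implicit,
	// so documents are fed in document order.
	for ( int pointer = 0; pointer < lastModified.length; pointer++ )
		accumulator.processData( pointer, lastModified[ pointer ] );
	// Flushes the last batch and writes the per-batch, cluster and global property files.
	accumulator.close();
}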