📄 scan.java
public String toString() {
	return this.getClass().getSimpleName() + "(" + basename + ":" + field + ")";
}

/** An accumulator for payloads.
 *
 * <P>This class is essentially a stripped-down version of {@link Scan} that just accumulates
 * payloads in a bitstream and releases them in batches. The main difference is that neither
 * sizes nor globcounts are saved (as they would not make much sense).
 */
protected static class PayloadAccumulator {
	/** The current basename of the overall index (usually some basename postfixed with the field name). */
	private final String basename;
	/** The field name, if available. */
	private final String field;
	/** The total number of postings (pairs term/document). */
	private long totPostings;
	/** The directory where batch files will be created. */
	private final File batchDir;
	/** The flag map for batches. */
	final Map<Component, Coding> flags;
	/** The total number of documents. */
	private int totDocuments;
	/** The number of documents indexed so far in the current batch. */
	private int documentCount;
	/** The current batch. */
	private int batch;
	/** The type of indexing for this scan. */
	private final IndexingType indexingType;
	/** The pointers into the stream, if {@link #indexingType} is {@link IndexingType#REMAPPED}. */
	private long position[];
	/** The output stream underlying this accumulator. */
	private FastByteArrayOutputStream accumulatorStream;
	/** The accumulating output bit stream, wrapping {@link #accumulatorStream}. */
	private OutputBitStream accumulator;
	/** The cutpoints of the batches (for later building a {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}). */
	protected final IntArrayList cutPoints;
	/** The payload accumulated by this accumulator. */
	private final Payload payload;
	/** The maximum document ever seen in the current batch. */
	private int maxDocInBatch;

	/** Creates a new accumulator.
	 *
	 * @param basename the basename (usually a global filename followed by the field name, separated by a dash).
	 * @param payload the payload stored by this accumulator.
	 * @param field the name of the accumulated field.
	 * @param indexingType the type of indexing procedure.
	 * @param documentsPerBatch the number of documents in each batch.
	 * @param batchDir a directory for batch files; batch names will be relativised to this directory if it is not <code>null</code>.
	 */
	public PayloadAccumulator( final String basename, final Payload payload, final String field, final IndexingType indexingType, final int documentsPerBatch, final File batchDir ) {
		this.basename = basename;
		this.payload = payload;
		this.field = field;
		this.indexingType = indexingType;
		if ( indexingType != IndexingType.STANDARD && indexingType != IndexingType.REMAPPED )
			throw new UnsupportedOperationException( "Non-standard payload-based indices support only standard and remapped indexing" );
		if ( indexingType == IndexingType.REMAPPED ) position = new long[ documentsPerBatch ];
		this.batchDir = batchDir;
		this.cutPoints = new IntArrayList();
		this.cutPoints.add( 0 );
		flags = new EnumMap<Component, Coding>( CompressionFlags.DEFAULT_PAYLOAD_INDEX );
		accumulatorStream = new FastByteArrayOutputStream();
		accumulator = new OutputBitStream( accumulatorStream );
	}

	/** Writes in compressed form the data currently accumulated.
	 */
	protected void writeData() throws IOException, ConfigurationException {
		final String batchBasename = batchBasename( batch, basename, batchDir );

		LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount );

		try {
			accumulator.flush();
			final InputBitStream ibs = new InputBitStream( accumulatorStream.array );
			final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, indexingType == IndexingType.STANDARD ? documentCount : maxDocInBatch + 1, false, flags );
			indexWriter.newInvertedList();
			indexWriter.writeFrequency( documentCount );
			OutputBitStream obs;

			if ( indexingType == IndexingType.STANDARD ) {
				for ( int i = 0; i < documentCount; i++ ) {
					obs = indexWriter.newDocumentRecord();
					indexWriter.writeDocumentPointer( obs, i );
					payload.read( ibs );
					indexWriter.writePayload( obs, payload );
				}
			}
			else {
				// We sort position by pointed document pointer.
				Sorting.quickSort( position, 0, documentCount, new LongComparator() {
					public int compare( final long position0, final long position1 ) {
						try {
							ibs.position( position0 );
							final int d0 = ibs.readDelta();
							ibs.position( position1 );
							return d0 - ibs.readDelta();
						}
						catch ( IOException e ) {
							throw new RuntimeException( e );
						}
					}
				} );

				for ( int i = 0; i < documentCount; i++ ) {
					obs = indexWriter.newDocumentRecord();
					ibs.position( position[ i ] );
					indexWriter.writeDocumentPointer( obs, ibs.readDelta() );
					payload.read( ibs );
					indexWriter.writePayload( obs, payload );
				}

				maxDocInBatch = 0;
			}

			indexWriter.close();

			final Properties properties = indexWriter.properties();
			totPostings += properties.getLong( "postings" );
			properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
			properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
			properties.setProperty( Index.PropertyKeys.SIZE, indexWriter.writtenBits() );
			properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
			properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
			if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
			properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

			// We *must* generate a fake term file, or index combination won't work.
			final PrintWriter termWriter = new PrintWriter( new FileWriter( batchBasename + DiskBasedIndex.TERMS_EXTENSION ) );
			termWriter.println( "#" );
			termWriter.close();

			cutPoints.add( cutPoints.getInt( cutPoints.size() - 1 ) + documentCount );
			accumulatorStream.reset();
			accumulator.writtenBits( 0 );
			documentCount = 0;
			maxDocInBatch = 0;
			batch++;
		}
		catch ( IOException e ) {
			LOGGER.fatal( "I/O Error on batch " + batch );
			throw e;
		}
	}

	/** Processes the payload of a given document.
	 *
	 * @param documentPointer the document pointer.
	 * @param content the payload.
	 */
	public void processData( final int documentPointer, final Object content ) throws IOException {
		// We write document pointers only for non-standard indices.
		if ( indexingType != IndexingType.STANDARD ) {
			position[ documentCount ] = accumulator.writtenBits();
			accumulator.writeDelta( documentPointer );
		}
		// TODO: devise an out-of-memory-error check mechanism similar to that of ByteArrayPostingList.
		payload.set( content );
		payload.write( accumulator );
		if ( documentPointer > maxDocInBatch ) maxDocInBatch = documentPointer;
		documentCount++;
		totDocuments++;
	}

	/** Closes this accumulator, releasing all resources. */
	public void close() throws ConfigurationException, IOException {
		if ( documentCount > 0 ) writeData();

		if ( totDocuments == 0 ) {
			// Special case: no document has been indexed. We generate an empty batch.
			final String batchBasename = batchBasename( 0, basename, batchDir );
			LOGGER.debug( "Generating empty index " + batchBasename );

			final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, 0, false, flags );
			indexWriter.close();

			final Properties properties = indexWriter.properties();
			properties.setProperty( Index.PropertyKeys.SIZE, 0 );
			properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
			properties.setProperty( Index.PropertyKeys.MAXCOUNT, -1 );
			properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
			properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
			properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
			if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
			properties.save( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION );

			new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ).close();
			batch = 1;
		}

		accumulator = null;
		accumulatorStream = null;
		position = null;

		final Properties properties = new Properties();
		if ( field != null ) properties.setProperty( Index.PropertyKeys.FIELD, field );
		properties.setProperty( Index.PropertyKeys.BATCHES, batch );
		properties.setProperty( Index.PropertyKeys.DOCUMENTS, totDocuments );
		properties.setProperty( Index.PropertyKeys.POSTINGS, totPostings );
		properties.setProperty( Index.PropertyKeys.OCCURRENCES, -1 );
		properties.setProperty( Index.PropertyKeys.MAXCOUNT, -1 );
		properties.setProperty( Index.PropertyKeys.MAXDOCSIZE, -1 );
		properties.setProperty( Index.PropertyKeys.TERMPROCESSOR, NullTermProcessor.class.getName() );
		properties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );

		// This set of batches can be seen as a documental cluster index.
		final Properties clusterProperties = new Properties();
		clusterProperties.addAll( properties );
		clusterProperties.setProperty( Index.PropertyKeys.TERMS, 1 );
		clusterProperties.setProperty( IndexCluster.PropertyKeys.BLOOM, false );
		clusterProperties.setProperty( IndexCluster.PropertyKeys.FLAT, true );

		if ( indexingType == IndexingType.STANDARD ) {
			clusterProperties.setProperty( Index.PropertyKeys.INDEXCLASS, DocumentalConcatenatedCluster.class.getName() );
			BinIO.storeObject( new ContiguousDocumentalStrategy( cutPoints.toIntArray() ), basename + CLUSTER_STRATEGY_EXTENSION );
		}
		else {
			clusterProperties.setProperty( Index.PropertyKeys.INDEXCLASS, DocumentalMergedCluster.class.getName() );
			BinIO.storeObject( new IdentityDocumentalStrategy( batch, totDocuments ), basename + CLUSTER_STRATEGY_EXTENSION );
		}
		clusterProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, basename + CLUSTER_STRATEGY_EXTENSION );
		for ( int i = 0; i < batch; i++ ) clusterProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, batchBasename( i, basename, batchDir ) );
		clusterProperties.save( basename + CLUSTER_PROPERTIES_EXTENSION );

		properties.save( basename + DiskBasedIndex.PROPERTIES_EXTENSION );
	}
}

public static int[] parseQualifiedSizes( final String[] qualifiedSizes, final String defaultSize, final int[] indexedField, final DocumentFactory factory ) throws ParseException {
	final int[] size = new int[ indexedField.length ];
	String defaultSpec = defaultSize;
	IntArrayList indexedFields = IntArrayList.wrap( indexedField );
	for ( int i = 0; i < qualifiedSizes.length; i++ )
		if ( qualifiedSizes[ i ].indexOf( ':' ) == -1 ) defaultSpec = qualifiedSizes[ i ];
	for ( int i = 0; i < size.length; i++ ) size[ i ] = (int)LongSizeStringParser.parseSize( defaultSpec );
	for ( int i = 0; i < qualifiedSizes.length; i++ ) {
		final int split = qualifiedSizes[ i ].indexOf( ':' );
		if ( split >= 0 ) {
			final String fieldName = qualifiedSizes[ i ].substring( 0, split );
			final int field = factory.fieldIndex( fieldName );
			if ( field < 0 ) throw new IllegalArgumentException( "Field " + fieldName + " is not part of factory " + factory.getClass().getName() );
			if ( !indexedFields.contains( field ) ) throw new IllegalArgumentException( "Field " + factory.fieldName( field ) + " is not being indexed" );
			size[ indexedFields.indexOf( field ) ] = (int)LongSizeStringParser.parseSize( qualifiedSizes[ i ].substring( split + 1 ) );
		}
	}
	return size;
}

public static VirtualDocumentResolver[] parseVirtualDocumentResolver( final String[] virtualDocumentSpec, final int[] indexedField, final DocumentFactory factory ) {
	final VirtualDocumentResolver[] virtualDocumentResolver = new VirtualDocumentResolver[ indexedField.length ]
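
A minimal usage sketch (not part of the original source): PayloadAccumulator is driven with one processData() call per document, followed by a single close(), which flushes the last batch and writes the per-batch, cluster and global property files. Because the class is a protected nested class, the hypothetical driver below is assumed to live inside Scan (or a subclass in the same package); the basename "index-date", the field name "date" and the DatePayload payload type are illustrative choices only, not values taken from this file.

// Hypothetical driver; assumed to be a member of Scan (or a subclass), since
// PayloadAccumulator is a protected nested class. All names below are illustrative.
static void accumulateDatesSketch( final java.util.Date[] lastModified ) throws java.io.IOException, org.apache.commons.configuration.ConfigurationException {
	final PayloadAccumulator accumulator = new PayloadAccumulator(
		"index-date",                                      // illustrative basename
		new it.unimi.dsi.mg4j.index.payload.DatePayload(), // the payload stored by this accumulator
		"date",                                            // illustrative field name
		IndexingType.STANDARD,                             // document pointers are implicit ( 0, 1, 2, ... )
		10000,                                             // documents per batch
		null                                               // batch files are created next to the basename
	);
	// One call per document; for STANDARD indexing the pointer is implicit,
	// so documents are fed in document order.
	for ( int pointer = 0; pointer < lastModified.length; pointer++ )
		accumulator.processData( pointer, lastModified[ pointer ] );
	// Flushes the last batch and writes the per-batch, cluster and global property files.
	accumulator.close();
}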