📄 scan.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
		/** A remapped index&mdash;documents will be provided in separate calls, but in any order. */
		REMAPPED,
		/**
		 * A virtual index&mdash;documents will be provided in any order, and a document may appear
		 * several times.
		 */
		VIRTUAL
	}

	/** An interface that describes a virtual document fragment.
	 *
	 * <p>When indexing in {@link IndexingType#VIRTUAL} mode, documents are composed
	 * of fragments (typically, some text surrounding an anchor) that are referred
	 * to a document by some specification (in the anchor example, the content of the
	 * <samp>href</samp> attribute of the anchor). This interface is used to describe such
	 * fragments (see, e.g., {@link AnchorExtractor}).
	 * 
	 * @see VirtualDocumentResolver
	 */
	
	public static interface VirtualDocumentFragment extends Serializable {
		/** The specification of the document to which this fragment belongs.
		 * 
		 * @return the specification of the document to which this fragment belongs.
		 * @see VirtualDocumentResolver
		 */
		public MutableString documentSpecifier();
		/** The textual content of this fragment.
		 * 
		 * @return the textual content of this fragment.
		 */
		public MutableString text();
	}
	
	/** The extension of the strategy file for the cluster associated to a scan. */
	private static final String CLUSTER_STRATEGY_EXTENSION = ".cluster.strategy";
	/** The extension of the property file for the cluster associated to a scan. */
	public static final String CLUSTER_PROPERTIES_EXTENSION = ".cluster.properties";
	/** The frequency with which we report the current number of terms. */
	private static final int TERM_REPORT_STEP = 1000000;
	/** The initial size of the term map. */
	private static final int INITIAL_TERM_MAP_SIZE = 1000;
	/** A term processor to be applied during the indexing phase. */
	private final TermProcessor termProcessor;
	/**
	 * The current basename of the overall index (usually some basename postfixed with the field
	 * name).
	 */
	private final String basename;
	/** The field name, if available. */
	private final String field;
	/** The size of a buffer (passed to the buffered streams this class opens). */
	private final int bufferSize;
	/** The directory where batch files will be created, or <code>null</code>. */
	private final File batchDir;
	/** The flag map for batches. */
	final Map<Component, Coding> flags;
	/** A map containing the terms seen so far. */
	private Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList> termMap;
	/**
	 * The output bit stream for size information. For {@link IndexingType#STANDARD} indexing, the
	 * list of &gamma;-coded document sizes. For {@link IndexingType#REMAPPED} indexing, a list of
	 * &gamma;-coded document numbers and document sizes.
	 */
	private OutputBitStream sizes;
	/** The total number of occurrences. */
	private long totOccurrences;
	/** The total number of postings (pairs term/document). */
	private long totPostings;
	/** The total number of documents. */
	private int totDocuments;
	/** Maximum occurrence count. */
	private int maxCount;
	/** Maximum size in words of documents seen so far. */
	private int globMaxDocSize;
	/** The number of documents indexed so far in the current batch. */
	private int documentCount;
	/** The number of terms seen so far in the current batch. */
	private int numTerms;
	/** Maximum size in words of documents seen so far in the current batch. */
	int maxDocSize;
	/** The current batch. */
	private int batch;
	/** The number of occurrences in the current batch. */
	private int numOccurrences;
	/** If true, this class experienced an {@link OutOfMemoryError} during some buffer reallocation. */
	public boolean outOfMemoryError;
	/** The type of indexing for this scan. */
	private final IndexingType indexingType;
	/** Whether {@link #indexingType} is {@link IndexingType#STANDARD}. */
	private final boolean indexingIsStandard;
	/** Whether {@link #indexingType} is {@link IndexingType#REMAPPED}. */
	private final boolean indexingIsRemapped;
	/** Whether {@link #indexingType} is {@link IndexingType#VIRTUAL}.
*/
	private final boolean indexingIsVirtual;
	/** The number of occurrences generated by the current document. */
	private int occsInCurrDoc;
	/** The current maximum position for each document, if the field indexed is virtual. */
	protected int[] currMaxPos;
	/**
	 * The maximum document pointer ever seen (could be different from the last document indexed if
	 * {@link #indexingType} is not {@link IndexingType#STANDARD}).
	 */
	private int maxDocInBatch;
	/** The width of the artificial gap introduced between virtual-document fragments. */
	protected int virtualDocumentGap;
	/** A builder that will be used to zip the document sequence while we pass through it. */
	private final ZipDocumentCollectionBuilder builder;
	/**
	 * The cutpoints of the batches (for building later a
	 * {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}).
	 */
	protected final IntArrayList cutPoints;
	/**
	 * Creates a new scanner instance.
	 * 
	 * <p>This constructor delegates to the full constructor with zero virtual documents and a zero
	 * virtual-document gap, selecting {@link IndexingType#STANDARD} when
	 * <code>documentsAreInOrder</code> is true and {@link IndexingType#VIRTUAL} otherwise.
	 * 
	 * <p>NOTE(review): the full constructor throws an {@link IllegalArgumentException} for a
	 * virtual index whose gap is zero, so passing <code>false</code> here appears to fail
	 * unconditionally; confirm whether {@link IndexingType#REMAPPED} was intended.
	 * 
	 * @param basename the basename (usually a global filename followed by the field name, separated
	 * by a dash).
	 * @param field the field to be indexed.
	 * @param termProcessor the term processor for this index.
	 * @param documentsAreInOrder if true, documents will be served in increasing order.
	 * @param bufferSize the buffer size used in all I/O.
	 * @param builder a builder used to create a compressed document collection on the fly.
	 * @param batchDir a directory for batch files; batch names will be relativised to this
	 * directory if it is not <code>null</code>.
	 * @throws FileNotFoundException propagated from the full constructor.
	 */
	public Scan( final String basename, final String field, final TermProcessor termProcessor, final boolean documentsAreInOrder, final int bufferSize, final ZipDocumentCollectionBuilder builder,
			final File batchDir ) throws FileNotFoundException {
		this( basename, field, termProcessor, documentsAreInOrder ? IndexingType.STANDARD : IndexingType.VIRTUAL, 0, 0, bufferSize, builder, batchDir );
	}
	/**
	 * Creates a new scanner instance.
	 * 
	 * <p>This constructor delegates to the full constructor with zero virtual documents and a zero
	 * virtual-document gap. Since the full constructor rejects a zero gap for
	 * {@link IndexingType#VIRTUAL}, virtual indices must be created with the full constructor.
	 * 
	 * @param basename the basename (usually a global filename followed by the field name, separated
	 * by a dash).
	 * @param field the field to be indexed.
	 * @param termProcessor the term processor for this index.
	 * @param indexingType the type of indexing procedure.
	 * @param bufferSize the buffer size used in all I/O.
	 * @param builder a builder used to create a compressed document collection on the fly.
	 * @param batchDir a directory for batch files; batch names will be relativised to this
	 * directory if it is not <code>null</code>.
	 * @throws FileNotFoundException propagated from the full constructor.
	 * 
	 */
	public Scan( final String basename, final String field, final TermProcessor termProcessor, final IndexingType indexingType, final int bufferSize, final ZipDocumentCollectionBuilder builder,
			final File batchDir ) throws FileNotFoundException {
		this( basename, field, termProcessor, indexingType, 0, 0, bufferSize, builder, batchDir );
	}
	/**
	 * Creates a new scanner instance.
	 * 
	 * @param basename the basename (usually a global filename followed by the field name, separated
	 * by a dash).
	 * @param field the field to be indexed.
	 * @param termProcessor the term processor for this index.
	 * @param indexingType the type of indexing procedure.
	 * @param numVirtualDocs the number of virtual documents that will be used, in case of a virtual
	 * index; otherwise, immaterial.
	 * @param virtualDocumentGap the artificial gap introduced between virtual-document fragments, in case
	 * of a virtual index (must then be nonzero); otherwise, immaterial.
	 * @param bufferSize the buffer size used in all I/O.
	 * @param builder a builder used to create a compressed document collection on the fly.
	 * @param batchDir a directory for batch files; batch names will be relativised to this
	 * directory if it is not <code>null</code>.
	 * @throws IllegalArgumentException if the index is virtual and <code>virtualDocumentGap</code> is zero.
	 * @throws FileNotFoundException presumably if the size bit stream cannot be opened&mdash;TODO confirm
	 * against <code>openSizeBitStream()</code>.
*/	public Scan( final String basename, final String field, final TermProcessor termProcessor, final IndexingType indexingType, final int numVirtualDocs, final int virtualDocumentGap, final int bufferSize,			final ZipDocumentCollectionBuilder builder, final File batchDir ) throws FileNotFoundException {		this.basename = basename;		this.field = field;		this.indexingType = indexingType;		this.termProcessor = termProcessor;		this.bufferSize = bufferSize;		this.builder = builder;		this.batchDir = batchDir;		this.virtualDocumentGap = virtualDocumentGap;		this.cutPoints = new IntArrayList();		this.cutPoints.add( 0 );		termMap = new Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList>( INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR );		flags = new EnumMap<Component, Coding>( CompressionFlags.DEFAULT_STANDARD_INDEX );		maxDocInBatch = -1;				indexingIsStandard = indexingType == IndexingType.STANDARD;		indexingIsRemapped = indexingType == IndexingType.REMAPPED;		indexingIsVirtual = indexingType == IndexingType.VIRTUAL;		if ( indexingIsVirtual && virtualDocumentGap == 0 ) throw new IllegalArgumentException( "Illegal virtual document gap: " + virtualDocumentGap );				if ( indexingIsVirtual ) currMaxPos = new int[ numVirtualDocs ];		openSizeBitStream();	}	/** Cleans all intermediate files generated by a run of this class.	 *	 * @param basename the basename of the run.	 * @param batches the number of generated batches.	 * @param batchDir if not <code>null</code>, a temporary directory where the batches are located.	 */	public static void cleanup( final String basename, final int batches, final File batchDir ) throws IOException {		final String basepath = ( batchDir != null ? 
new File( basename ) : new File( basename ) ).getCanonicalPath();		new File( basepath.toString() + CLUSTER_STRATEGY_EXTENSION ).delete();		new File( basepath.toString()  + CLUSTER_PROPERTIES_EXTENSION ).delete();		for( int i = 0; i < batches; i++ ) {			final String batchBasename = batchBasename( i, basename, batchDir );			new File( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.INDEX_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.SIZES_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.STATS_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.TERMS_EXTENSION ).delete();			new File( batchBasename + DiskBasedIndex.UNSORTED_TERMS_EXTENSION ).delete();		}	}		/**	 * Returns the name of a batch.	 * 	 * <p>You can override this method if you prefer a different batch naming scheme.	 * 	 * @param batch the batch number.	 * @param basename the index basename.	 * @param batchDir if not <code>null</code>, a temporary directory for batches.	 * @return simply <code>basename@batch</code>, if <code>batchDir</code> is	 * <code>null</code>; otherwise, we relativise the name to <code>batchDir</code>.	 */	protected static String batchBasename( int batch, String basename, final File batchDir ) {		return batchDir != null ? new File( batchDir, basename + "@" + batch ).toString() : basename + "@" + batch;	}	/**	 * Dumps the current batch on disk as an index.	 *	 * @return the number of occurrences contained in the batch. 	 
*/	protected long dumpBatch() throws IOException, ConfigurationException {		outOfMemoryError = false;		final String batchBasename = batchBasename( batch, basename, batchDir );		LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount + "; terms:" + numTerms + "; occurrences: " + numOccurrences );		// We write down all term in appearance order in termArray.		MutableString[] termArray = termMap.keySet().toArray( new MutableString[ numTerms ] );		if ( ASSERTS ) assert numTerms == termMap.size();		if ( ! indexingIsVirtual ) sizes.close();		// We sort the terms appering in the batch and write them on disk.		Sorting.quickSort( termArray );		final PrintWriter pw = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ), bufferSize ), "UTF-8" ) );		for ( MutableString t : termArray )			t.println( pw );		pw.close();		try {
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -