📄 scan.java
字号:
/** A remapped index—documents will be provided in separate calls, but in any order. */
        REMAPPED,
        /**
         * A virtual index—documents will be provided in any order, and a document may appear
         * several times.
         */
        VIRTUAL
    }

    /** An interface that describes a virtual document fragment.
     *
     * <p>When indexing in {@link IndexingType#VIRTUAL} mode, documents are composed
     * of fragments (typically, some text surrounding an anchor) that are referred
     * to a document by some specification (in the previous case, the content of the
     * <samp>href</samp> attribute of the anchor). This interface is used to describe
     * such fragments (see, e.g., {@link AnchorExtractor}).
     *
     * @see VirtualDocumentResolver
     */
    public static interface VirtualDocumentFragment extends Serializable {
        /** The specification of the document to which this fragment belongs.
         *
         * @return the specification of the document to which this fragment belongs.
         * @see VirtualDocumentResolver
         */
        public MutableString documentSpecifier();

        /** The textual content of this fragment.
         *
         * @return the textual content of this fragment.
         */
        public MutableString text();
    }

    /** The extension of the strategy for the cluster associated to a scan. */
    private static final String CLUSTER_STRATEGY_EXTENSION = ".cluster.strategy";

    /** The extension of the property file for the cluster associated to a scan. */
    public static final String CLUSTER_PROPERTIES_EXTENSION = ".cluster.properties";

    /** The frequency with which we report the current number of terms. */
    private static final int TERM_REPORT_STEP = 1000000;

    /** The initial size of the term map. */
    private static final int INITIAL_TERM_MAP_SIZE = 1000;

    /** A term processor to be applied during the indexing phase. */
    private final TermProcessor termProcessor;

    /**
     * The current basename of the overall index (usually some basename postfixed with the field
     * name).
     */
    private final String basename;

    /** The field name, if available. */
    private final String field;

    /** The size of a buffer. */
    private final int bufferSize;

    /** The directory where batch files will be created. */
    private final File batchDir;

    /** The flag map for batches. */
    final Map<Component, Coding> flags;

    /** A map containing the terms seen so far. */
    private Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList> termMap;

    /**
     * The output bit stream for size information. For {@link IndexingType#STANDARD} indexing, the
     * list of γ-coded document sizes. For {@link IndexingType#REMAPPED} indexing, a list of
     * γ-coded document numbers and document sizes.
     */
    private OutputBitStream sizes;

    /** The total number of occurrences. */
    private long totOccurrences;

    /** The total number of postings (pairs term/document). */
    private long totPostings;

    /** The total number of documents. */
    private int totDocuments;

    /** Maximum occurrence count. */
    private int maxCount;

    /** Maximum size in words of documents seen so far. */
    private int globMaxDocSize;

    /** The number of documents indexed so far in the current batch. */
    private int documentCount;

    /** The number of terms seen so far in the current batch. */
    private int numTerms;

    /** Maximum size in words of documents seen so far in the current batch. */
    int maxDocSize;

    /** The current batch. */
    private int batch;

    /** The number of occurrences in the current batch. */
    private int numOccurrences;

    /**
     * If true, this class experienced an {@link OutOfMemoryError} during some buffer reallocation.
     * The flag is cleared at the start of {@link #dumpBatch()}.
     */
    public boolean outOfMemoryError;

    /** The type of indexing for this scan. */
    private final IndexingType indexingType;

    /** Whether {@link #indexingType} is {@link IndexingType#STANDARD}. */
    private final boolean indexingIsStandard;

    /** Whether {@link #indexingType} is {@link IndexingType#REMAPPED}. */
    private final boolean indexingIsRemapped;

    /** Whether {@link #indexingType} is {@link IndexingType#VIRTUAL}. */
    private final boolean indexingIsVirtual;

    /** The number of occurrences generated by the current document.
*/ private int occsInCurrDoc; /** The current maximum position for each document, if the field indexed is virtual. */ protected int[] currMaxPos; /** * The maximum document pointer ever seen (could be different from the last document indexed if * {@link #indexingType} is not {@link IndexingType#STANDARD}). */ private int maxDocInBatch; /** The width of the artificial gap introduced between virtual-document fragments. */ protected int virtualDocumentGap; /** A builder that will be used to zip the document sequence while we pass through it. */ private final ZipDocumentCollectionBuilder builder; /** * The cutpoints of the batches (for building later a * {@link it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy}). */ protected final IntArrayList cutPoints; /** * Creates a new scanner instance. * * @param basename the basename (usually a global filename followed by the field name, separated * by a dash). * @param field the field to be indexed. * @param termProcessor the term processor for this index. * @param documentsAreInOrder if true, documents will be served in increasing order. * @param bufferSize the buffer size used in all I/O. * @param builder a builder used to create a compressed document collection on the fly. * @param batchDir a directory for batch files; batch names will be relativised to this * directory if it is not <code>null</code>. * @throws FileNotFoundException */ public Scan( final String basename, final String field, final TermProcessor termProcessor, final boolean documentsAreInOrder, final int bufferSize, final ZipDocumentCollectionBuilder builder, final File batchDir ) throws FileNotFoundException { this( basename, field, termProcessor, documentsAreInOrder ? IndexingType.STANDARD : IndexingType.VIRTUAL, 0, 0, bufferSize, builder, batchDir ); } /** * Creates a new scanner instance. 
* * @throws FileNotFoundException * */ public Scan( final String basename, final String field, final TermProcessor termProcessor, final IndexingType indexingType, final int bufferSize, final ZipDocumentCollectionBuilder builder, final File batchDir ) throws FileNotFoundException { this( basename, field, termProcessor, indexingType, 0, 0, bufferSize, builder, batchDir ); } /** * Creates a new scanner instance. * * @param basename the basename (usually a global filename followed by the field name, separated * by a dash). * @param field the field to be indexed. * @param termProcessor the term processor for this index. * @param indexingType the type of indexing procedure. * @param numVirtualDocs the number of virtual documents that will be used, in case of a virtual * index; otherwise, immaterial. * @param virtualDocumentGap the artificial gap introduced between virtual documents fragments, in case * of a virtual index; otherwise, immaterial. * @param bufferSize the buffer size used in all I/O. * @param builder a builder used to create a compressed document collection on the fly. * @param batchDir a directory for batch files; batch names will be relativised to this * directory if it is not <code>null</code>. 
*/ public Scan( final String basename, final String field, final TermProcessor termProcessor, final IndexingType indexingType, final int numVirtualDocs, final int virtualDocumentGap, final int bufferSize, final ZipDocumentCollectionBuilder builder, final File batchDir ) throws FileNotFoundException { this.basename = basename; this.field = field; this.indexingType = indexingType; this.termProcessor = termProcessor; this.bufferSize = bufferSize; this.builder = builder; this.batchDir = batchDir; this.virtualDocumentGap = virtualDocumentGap; this.cutPoints = new IntArrayList(); this.cutPoints.add( 0 ); termMap = new Object2ReferenceOpenHashMap<MutableString, ByteArrayPostingList>( INITIAL_TERM_MAP_SIZE, Hash.FAST_LOAD_FACTOR ); flags = new EnumMap<Component, Coding>( CompressionFlags.DEFAULT_STANDARD_INDEX ); maxDocInBatch = -1; indexingIsStandard = indexingType == IndexingType.STANDARD; indexingIsRemapped = indexingType == IndexingType.REMAPPED; indexingIsVirtual = indexingType == IndexingType.VIRTUAL; if ( indexingIsVirtual && virtualDocumentGap == 0 ) throw new IllegalArgumentException( "Illegal virtual document gap: " + virtualDocumentGap ); if ( indexingIsVirtual ) currMaxPos = new int[ numVirtualDocs ]; openSizeBitStream(); } /** Cleans all intermediate files generated by a run of this class. * * @param basename the basename of the run. * @param batches the number of generated batches. * @param batchDir if not <code>null</code>, a temporary directory where the batches are located. */ public static void cleanup( final String basename, final int batches, final File batchDir ) throws IOException { final String basepath = ( batchDir != null ? 
new File( basename ) : new File( basename ) ).getCanonicalPath(); new File( basepath.toString() + CLUSTER_STRATEGY_EXTENSION ).delete(); new File( basepath.toString() + CLUSTER_PROPERTIES_EXTENSION ).delete(); for( int i = 0; i < batches; i++ ) { final String batchBasename = batchBasename( i, basename, batchDir ); new File( batchBasename + DiskBasedIndex.FREQUENCIES_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.INDEX_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.OFFSETS_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.SIZES_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.STATS_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.TERMS_EXTENSION ).delete(); new File( batchBasename + DiskBasedIndex.UNSORTED_TERMS_EXTENSION ).delete(); } } /** * Returns the name of a batch. * * <p>You can override this method if you prefer a different batch naming scheme. * * @param batch the batch number. * @param basename the index basename. * @param batchDir if not <code>null</code>, a temporary directory for batches. * @return simply <code>basename@batch</code>, if <code>batchDir</code> is * <code>null</code>; otherwise, we relativise the name to <code>batchDir</code>. */ protected static String batchBasename( int batch, String basename, final File batchDir ) { return batchDir != null ? new File( batchDir, basename + "@" + batch ).toString() : basename + "@" + batch; } /** * Dumps the current batch on disk as an index. * * @return the number of occurrences contained in the batch. 
*/ protected long dumpBatch() throws IOException, ConfigurationException { outOfMemoryError = false; final String batchBasename = batchBasename( batch, basename, batchDir ); LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount + "; terms:" + numTerms + "; occurrences: " + numOccurrences ); // We write down all term in appearance order in termArray. MutableString[] termArray = termMap.keySet().toArray( new MutableString[ numTerms ] ); if ( ASSERTS ) assert numTerms == termMap.size(); if ( ! indexingIsVirtual ) sizes.close(); // We sort the terms appering in the batch and write them on disk. Sorting.quickSort( termArray ); final PrintWriter pw = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( batchBasename + DiskBasedIndex.TERMS_EXTENSION ), bufferSize ), "UTF-8" ) ); for ( MutableString t : termArray ) t.println( pw ); pw.close(); try {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -