📄 partitiondocumentally.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.IntList;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.CachingOutputBitStream;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.IndexReader;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.IndexWriter;import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;import 
it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.BloomFilter;import it.unimi.dsi.util.ImmutableExternalPrefixMap;import it.unimi.dsi.util.PrefixMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.ShiftAddXorSignedStringMap;import it.unimi.dsi.util.StringMap;import java.io.BufferedWriter;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.PrintWriter;import java.net.URISyntaxException;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.configuration.ConfigurationMap;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** Partitions an index documentally. * * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy} * that specifies a destination local index for each document, and a local document pointer. The global index * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance, * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents. * * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper) * subset of the terms in the global index. 
Thus, the local term numbers and the global term numbers will not in general coincide. * As a result, when a set of local indices is accessed transparently as a single index * using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster}, * a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException}, * because there is no way to map the global term numbers to local term numbers. * * <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be passed to each local index to * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid * inquiring indices that do not contain a term. The precision of the filters is settable. * * <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide * a {@link ContiguousDocumentalStrategy}, in which case a * {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might * be other cases in which the latter is appropriate, in which case you can manually edit the property file. * * <strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps} * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g., * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}). * * <h2>Write-once output and distributed index partitioning</h2> * * Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}—the same comments apply. 
* * @author Alessandro Arrabito * @author Sebastiano Vigna * * @since 1.0.1 */public class PartitionDocumentally { private final static Logger LOGGER = Util.getLogger( PartitionDocumentally.class ); /** The default buffer size for all involved indices. */ public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024; /** The number of local indices. */ private final int numIndices; /** The output basenames. */ private final String outputBasename; /** The array of local output basenames. */ private final String[] localBasename; /** The input basename. */ private final String inputBasename; /** The properties of the input index. */ private final Properties inputProperties; /** The size of I/O buffers. */ private final int bufferSize; /** The filename of the strategy used to partition the index. */ private final String strategyFilename; /** The strategy used to perform the partitioning. */ private final DocumentalPartitioningStrategy strategy; /** The additional local properties of each local index. */ private final Properties[] strategyProperties; /** The logging interval. */ private final long logInterval; /** The global index to be partitioned. */ private final BitStreamIndex globalIndex; /** A reader on {@link #globalIndex}. */ private final IndexReader indexReader; /** A reader for the terms of the global index. */ private final FastBufferedReader terms; /** An index writer for each local index. */ private final IndexWriter[] indexWriter; /** Whether each {@link #indexWriter} has counts. */ private final boolean haveCounts; /** Whether each {@link #indexWriter} has positions. */ private final boolean havePositions; /** Whether each {@link #indexWriter} has payloads. */ private final boolean havePayloads; /** A bit output stream for global counts of each local index. */ private final OutputBitStream[] localGlobCounts; /** A bit output stream for the frequencies of each local index. 
*/ private final OutputBitStream[] localFrequencies; /** A print writer for the terms of each local index. */ private final PrintWriter[] localTerms; /** The maximum size of a document in each local index. */ private final int[] maxDocSize; /** The maximum number of positions in each local index. */ private final int[] maxDocPos; /** The number of terms in each local index. */ private final int[] numTerms; /** The number of postings in each local index. */ private final long[] numPostings; /** The number of occurrences in each local index. */ private final long[] numOccurrences; /** The global count for each local index. */ private final long[] globCount; /** The required precision for Bloom filters (0 means no filter). */ private final int bloomFilterPrecision; /** Creates a new documental partitioner. * * <p>The constructor stores its arguments, loads the properties of the global index from * <code>inputBasename</code> + {@link DiskBasedIndex#PROPERTIES_EXTENSION}, obtains the global index through * {@link DiskBasedIndex#getInstance} together with a reader on it, and derives one local basename per local index * by appending <code>-<var>i</var></code> to <code>outputBasename</code>. It then checks that every component requested in * <code>writerFlags</code> is available in the global index, creates an index writer for each local index, * creates the per-index output files for frequencies and terms (and for global counts, only when counts are requested), * and finally opens the term file of the global index for reading. * * <p>The writer class is chosen as follows: a {@link BitStreamHPIndexWriter} when the index is not interleaved; * otherwise a {@link BitStreamIndexWriter} when <code>skips</code> is false, and a {@link SkipBitStreamIndexWriter} when it is true. * * @param inputBasename the basename of the global index; the property file and the term file are located by appending the relevant extension to it. * @param outputBasename the basename from which the local basenames <code>outputBasename-<var>i</var></code> are derived. * @param strategy the partitioning strategy; it provides the number of local indices, the number of documents of each local index and the strategy properties. * @param strategyFilename the filename of the strategy; it is only stored by this constructor. * @param bloomFilterPrecision the required precision for Bloom filters; it is only stored by this constructor. * @param bufferSize the size of I/O buffers; it is only stored by this constructor. * @param writerFlags the compression flags passed to every index writer; the presence of the {@link Component#COUNTS}, * {@link Component#POSITIONS} and {@link Component#PAYLOADS} keys determines which components are requested. * @param interleaved whether an interleaved writer is requested; it is forced to true when positions are not requested or payloads are requested. * @param skips whether, for an interleaved index, a {@link SkipBitStreamIndexWriter} should be used in place of a {@link BitStreamIndexWriter}. * @param quantum passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}. * @param height passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}. * @param skipBufferSize passed to {@link BitStreamHPIndexWriter} and {@link SkipBitStreamIndexWriter}. * @param logInterval the logging interval; it is only stored by this constructor. * @throws IllegalArgumentException if <code>writerFlags</code> requests payloads, counts or positions and the global index does not contain them. */ public PartitionDocumentally( final String inputBasename, final String outputBasename, final DocumentalPartitioningStrategy strategy, final String strategyFilename, final int bloomFilterPrecision, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, final boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException { this.inputBasename = inputBasename; this.outputBasename = outputBasename; this.strategy = strategy; this.strategyFilename = strategyFilename; this.strategyProperties = strategy.properties(); this.bufferSize = bufferSize; this.logInterval = logInterval; this.bloomFilterPrecision = bloomFilterPrecision; numIndices = strategy.numberOfLocalIndices(); final Coding positionCoding = writerFlags.get( Component.POSITIONS ); inputProperties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ); globalIndex = DiskBasedIndex.getInstance( inputBasename, inputProperties, false, positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE, false, null ); indexReader = 
globalIndex.getReader(); /* Local index i uses the basename outputBasename + "-" + i. */ localBasename = new String[ numIndices ]; for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i; localGlobCounts = new OutputBitStream[ numIndices ]; localFrequencies = new OutputBitStream[ numIndices ]; localTerms = new PrintWriter[ numIndices ]; maxDocSize = new int[ numIndices ]; maxDocPos = new int[ numIndices ]; numTerms = new int[ numIndices ]; globCount = new long[ numIndices ]; numOccurrences = new long[ numIndices ]; numPostings = new long[ numIndices ]; indexWriter = new IndexWriter[ numIndices ]; /* Every component requested through writerFlags must be present in the global index. */ if ( ( havePayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && ! globalIndex.hasPayloads ) throw new IllegalArgumentException( "You requested payloads, but the global index does not contain them." ); if ( ( haveCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! globalIndex.hasCounts ) throw new IllegalArgumentException( "You requested counts, but the global index does not contain them." ); if ( ( havePositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! globalIndex.hasPositions ) throw new IllegalArgumentException( "You requested positions, but the global index does not contain them." ); /* An interleaved writer is forced when positions are not requested or payloads are requested. */ interleaved |= ! havePositions || havePayloads; for ( int i = 0; i < numIndices; i++ ) { String name = localBasename[ i ]; if ( ! interleaved ) indexWriter[ i ] = new BitStreamHPIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height ); else if ( ! 
skips ) indexWriter[ i ] = new BitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, writerFlags ); else indexWriter[ i ] = new SkipBitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height ); /* The global-counts stream is created only when counts are requested; otherwise its array entry stays null. */ if ( haveCounts ) localGlobCounts[ i ] = new OutputBitStream( name + DiskBasedIndex.GLOBCOUNTS_EXTENSION ); localFrequencies[ i ] = new OutputBitStream( name + DiskBasedIndex.FREQUENCIES_EXTENSION ); localTerms[ i ] = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) ); } /* The terms of the global index are read as UTF-8 text. */ terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ); } private void partitionSizes() throws IOException { final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION ); if ( sizesFile.exists() ) { LOGGER.info( "Partitioning sizes..." ); final InputBitStream sizes = new InputBitStream ( sizesFile ); final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ]; for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -