
PartitionDocumentally.java

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections.
Language: Java
Page 1 of 2
package it.unimi.dsi.mg4j.tool;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2007 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.mg4j.index.BitStreamIndex;
import it.unimi.dsi.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.index.IndexReader;
import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.IndexWriter;
import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;
import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalStrategies;
import it.unimi.dsi.mg4j.index.cluster.IndexCluster;
import it.unimi.dsi.mg4j.index.payload.Payload;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.Util;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ImmutableExternalPrefixMap;
import it.unimi.dsi.util.PrefixMap;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.ConfigurationMap;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** Partitions an index documentally.
 *
 * <p>A global index is partitioned documentally by providing a {@link DocumentalPartitioningStrategy}
 * that specifies a destination local index for each document, and a local document pointer. The global index
 * is scanned, and the postings are partitioned among the local indices using the provided strategy. For instance,
 * a {@link ContiguousDocumentalStrategy} divides an index into blocks of contiguous documents.
 *
 * <p>Since each local index contains a (proper) subset of the original set of documents, it contains in general a (proper)
 * subset of the terms in the global index. Thus, the local term numbers and the global term numbers will not in general coincide.
 * As a result, when a set of local indices is accessed transparently as a single index
 * using a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster},
 * a call to {@link it.unimi.dsi.mg4j.index.Index#documents(int)} will throw an {@link java.lang.UnsupportedOperationException},
 * because there is no way to map the global term numbers to local term numbers.
 *
 * <p>On the other hand, a call to {@link it.unimi.dsi.mg4j.index.Index#documents(CharSequence)} will be passed to each local index to
 * build a global iterator. To speed up this phase for not-so-frequent terms, when partitioning an index you can require
 * the construction of {@linkplain BloomFilter Bloom filters} that will be used to try to avoid
 * querying indices that do not contain a term. The precision of the filters is settable.
 *
 * <p>The property file will use a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster} unless you provide
 * a {@link ContiguousDocumentalStrategy}, in which case a
 * {@link it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster} will be used instead. Note that there might
 * be other cases in which the latter is appropriate, in which case you can edit the property file manually.
 *
 * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}
 * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g.,
 * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}).
 *
 * <h2>Write-once output and distributed index partitioning</h2>
 *
 * Please see {@link it.unimi.dsi.mg4j.tool.PartitionLexically}&mdash;the same comments apply.
 *
 * @author Alessandro Arrabito
 * @author Sebastiano Vigna
 *
 * @since 1.0.1
 */

public class PartitionDocumentally {
	private final static Logger LOGGER = Util.getLogger( PartitionDocumentally.class );

	/** The default buffer size for all involved indices. */
	public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;

	/** The number of local indices. */
	private final int numIndices;
	/** The output basename. */
	private final String outputBasename;
	/** The array of local output basenames. */
	private final String[] localBasename;
	/** The input basename. */
	private final String inputBasename;
	/** The properties of the input index. */
	private final Properties inputProperties;
	/** The size of I/O buffers. */
	private final int bufferSize;
	/** The filename of the strategy used to partition the index. */
	private final String strategyFilename;
	/** The strategy used to perform the partitioning. */
	private final DocumentalPartitioningStrategy strategy;
	/** The additional local properties of each local index. */
	private final Properties[] strategyProperties;
	/** The logging interval. */
	private final long logInterval;
	/** The global index to be partitioned. */
	private final BitStreamIndex globalIndex;
	/** A reader on {@link #globalIndex}. */
	private final IndexReader indexReader;
	/** A reader for the terms of the global index. */
	private final FastBufferedReader terms;
	/** An index writer for each local index. */
	private final IndexWriter[] indexWriter;
	/** Whether each {@link #indexWriter} has counts. */
	private final boolean haveCounts;
	/** Whether each {@link #indexWriter} has positions. */
	private final boolean havePositions;
	/** Whether each {@link #indexWriter} has payloads. */
	private final boolean havePayloads;
	/** A bit output stream for global counts of each local index. */
	private final OutputBitStream[] localGlobCounts;
	/** A bit output stream for the frequencies of each local index. */
	private final OutputBitStream[] localFrequencies;
	/** A print writer for the terms of each local index. */
	private final PrintWriter[] localTerms;
	/** The maximum size of a document in each local index. */
	private final int[] maxDocSize;
	/** The maximum number of positions in each local index. */
	private final int[] maxDocPos;
	/** The number of terms in each local index. */
	private final int[] numTerms;
	/** The number of postings in each local index. */
	private final long[] numPostings;
	/** The number of occurrences in each local index. */
	private final long[] numOccurrences;
	/** The global count for each local index. */
	private final long[] globCount;
	/** The required precision for Bloom filters (0 means no filter). */
	private final int bloomFilterPrecision;

	public PartitionDocumentally( final String inputBasename,
			final String outputBasename,
			final DocumentalPartitioningStrategy strategy,
			final String strategyFilename,
			final int bloomFilterPrecision,
			final int bufferSize,
			final Map<Component,Coding> writerFlags,
			boolean interleaved,
			final boolean skips,
			final int quantum,
			final int height,
			final int skipBufferSize,
			final long logInterval ) throws ConfigurationException, IOException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException {
		this.inputBasename = inputBasename;
		this.outputBasename = outputBasename;
		this.strategy = strategy;
		this.strategyFilename = strategyFilename;
		this.strategyProperties = strategy.properties();
		this.bufferSize = bufferSize;
		this.logInterval = logInterval;
		this.bloomFilterPrecision = bloomFilterPrecision;
		numIndices = strategy.numberOfLocalIndices();

		final Coding positionCoding = writerFlags.get( Component.POSITIONS );
		inputProperties = new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
		globalIndex = DiskBasedIndex.getInstance( inputBasename, inputProperties, false, positionCoding == Coding.GOLOMB || positionCoding == Coding.INTERPOLATIVE, false, null );
		indexReader = globalIndex.getReader();

		localBasename = new String[ numIndices ];
		for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;

		localGlobCounts = new OutputBitStream[ numIndices ];
		localFrequencies = new OutputBitStream[ numIndices ];
		localTerms = new PrintWriter[ numIndices ];
		maxDocSize = new int[ numIndices ];
		maxDocPos = new int[ numIndices ];
		numTerms = new int[ numIndices ];
		globCount = new long[ numIndices ];
		numOccurrences = new long[ numIndices ];
		numPostings = new long[ numIndices ];
		indexWriter = new IndexWriter[ numIndices ];

		if ( ( havePayloads = writerFlags.containsKey( Component.PAYLOADS ) ) && ! globalIndex.hasPayloads )
			throw new IllegalArgumentException( "You requested payloads, but the global index does not contain them." );
		if ( ( haveCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! globalIndex.hasCounts )
			throw new IllegalArgumentException( "You requested counts, but the global index does not contain them." );
		if ( ( havePositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! globalIndex.hasPositions )
			throw new IllegalArgumentException( "You requested positions, but the global index does not contain them." );

		interleaved |= ! havePositions || havePayloads;

		for ( int i = 0; i < numIndices; i++ ) {
			String name = localBasename[ i ];
			if ( ! interleaved ) indexWriter[ i ] = new BitStreamHPIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );
			else if ( ! skips ) indexWriter[ i ] = new BitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, writerFlags );
			else indexWriter[ i ] = new SkipBitStreamIndexWriter( localBasename[ i ], strategy.numberOfDocuments( i ), true, skipBufferSize, writerFlags, quantum, height );

			if ( haveCounts ) localGlobCounts[ i ] = new OutputBitStream( name + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
			localFrequencies[ i ] = new OutputBitStream( name + DiskBasedIndex.FREQUENCIES_EXTENSION );
			localTerms[ i ] = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );
		}

		terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
	}

	private void partitionSizes() throws IOException {
		final File sizesFile = new File( inputBasename + DiskBasedIndex.SIZES_EXTENSION );
		if ( sizesFile.exists() ) {
			LOGGER.info( "Partitioning sizes..." );
			final InputBitStream sizes = new InputBitStream( sizesFile );
			final OutputBitStream localSizes[] = new OutputBitStream[ numIndices ];
			for ( int i = 0; i < numIndices; i++ ) localSizes[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
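
The listing above ends partway through partitionSizes(); the rest of the class (including the command-line main) is on the second page. The following is a minimal usage sketch, not part of the original file: it only exercises the constructor shown above. It assumes that the ContiguousDocumentalStrategy constructor takes the document cutpoints (the first document of each local index plus the total number of documents) and that CompressionFlags.DEFAULT_STANDARD_INDEX provides a default Map<Component,Coding>; the basenames and the numeric skip/logging parameters are hypothetical placeholders.

import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.mg4j.index.CompressionFlags;
import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;
import it.unimi.dsi.mg4j.index.cluster.DocumentalPartitioningStrategy;
import it.unimi.dsi.mg4j.tool.PartitionDocumentally;

public class PartitionExample {
	public static void main( final String[] args ) throws Exception {
		// Split a 1,000,000-document global index into two local indices:
		// documents [0..500000) go to local index 0, [500000..1000000) to local index 1.
		// Assumption: the cutpoints are the first document of each local index
		// plus the total number of documents.
		final DocumentalPartitioningStrategy strategy = new ContiguousDocumentalStrategy( 0, 500000, 1000000 );

		// Serialize the strategy so its filename can be recorded in the local property files.
		BinIO.storeObject( strategy, "global.strategy" );      // hypothetical filename

		new PartitionDocumentally(
			"global",                                    // basename of the index to partition (hypothetical)
			"local",                                     // output basename; local indices become local-0, local-1
			strategy,
			"global.strategy",                           // strategy filename
			0,                                           // Bloom filter precision: 0 = no filters
			PartitionDocumentally.DEFAULT_BUFFER_SIZE,   // I/O buffer size
			CompressionFlags.DEFAULT_STANDARD_INDEX,     // assumed default compression flags
			false,                                       // interleaved
			true,                                        // skips
			64,                                          // skip quantum (placeholder)
			10,                                          // skip-tower height (placeholder)
			1024,                                        // skip buffer size (placeholder)
			10000                                        // logging interval, in milliseconds
		);
		// The method that actually scans the global index and writes the local indices
		// is in the part of the file not reproduced on this page.
	}
}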
