📄 partitionlexically.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.tool;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import it.unimi.dsi.mg4j.index.BitStreamHPIndex;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.cluster.ContiguousLexicalStrategy;import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.cluster.LexicalCluster;import it.unimi.dsi.mg4j.index.cluster.LexicalPartitioningStrategy;import it.unimi.dsi.mg4j.index.cluster.LexicalStrategies;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.mg4j.search.score.BM25Scorer;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.BloomFilter;import it.unimi.dsi.util.ImmutableExternalPrefixMap;import it.unimi.dsi.util.PrefixMap;import it.unimi.dsi.util.Properties;import it.unimi.dsi.util.ShiftAddXorSignedStringMap;import 
it.unimi.dsi.util.StringMap;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStream;import java.io.OutputStreamWriter;import java.io.PrintWriter;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.configuration.ConfigurationMap;import org.apache.commons.io.IOUtils;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** Partitions an index lexically. *  * <p>A global index is partitioned lexically by providing a {@link LexicalPartitioningStrategy} * that specifies a destination local index for each term, and a local term number. The global index * is read directly at the bit level, and the posting lists are divided among the  * local indices using the provided strategy. For instance, * an {@link ContiguousLexicalStrategy} divides an index into  * contiguous blocks (of terms) specified by the given strategy. *  * <p>By choice, document pointers are not remapped. Thus, it may happen that one of the local indices  * contains <em>no</em> posting with a certain document. However, computing the subset of documents contained * in each local index to remap them in a contiguous interval is not a good idea, as usually the subset * of documents appearing in the postings of each local index is large. 
* * <p>To speed up the search of the right local index of a not-so-frequent term (in * particular with a {@linkplain it.unimi.dsi.mg4j.index.cluster.ChainedLexicalClusteringStrategy chained strategy}),  * after partitioning an index you can create {@linkplain BloomFilter Bloom filters} that will be used to try to avoid * inquiring indices that do not contain a term. The filters will be automatically loaded * by {@link it.unimi.dsi.mg4j.index.cluster.IndexCluster#getInstance(CharSequence, boolean, boolean)}. *  * <p>Note that the size file is the same for each local index and <em>is not copied</em>. Please use * standard operating system features such as symbolic links to provide size files to  * local indices.  *  * <p>If you plan to {@linkplain LexicalCluster cluster} the partitioned indices and you need document sizes  * (e.g., for {@linkplain BM25Scorer BM25 scoring}), you can use the index property  * {@link it.unimi.dsi.mg4j.index.Index.UriKeys#SIZES} to load the original size file.   *  * <p>If you plan on partitioning an index requiring * document sizes, you should consider a custom index loading scheme  * that shares the {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndex#sizes size list} * among all local indices. * * <p><strong>Important</strong>: this class just partitions the index. No auxiliary files (most notably, {@linkplain StringMap term maps}  * or {@linkplain PrefixMap prefix maps}) will be generated. Please refer to a {@link StringMap} implementation (e.g., * {@link ShiftAddXorSignedStringMap} or {@link ImmutableExternalPrefixMap}). * * <h2>Write-once output and distributed index partitioning</h2> *  * <p>The partitioning process writes each index file sequentially exactly once, so index partitioning * can output its results to <em>pipes</em>, which in * turn can spill their content, for instance, through the network. 
In other words, albeit this * class theoretically creates a number of local indices on disk, those indices can be * substituted with suitable pipes creating remote local indices without affecting the partitioning process. * For instance, the following <samp>bash</samp> code creates three sets of pipes: * <pre style="margin: 1em 0"> * for i in 0 1 2; do *   for e in frequencies globcounts index offsets properties sizes terms; do  *     mkfifo pipe-$i.$e *   done * done * </pre>  *  * <p>Each pipe must be emptied elsewhere, for instance (assuming * you want local indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>): * <pre style="margin: 1em 0"> * for i in 0 1 2; do  *   for e in frequencies globcounts index offsets properties sizes terms; do  *     (cat pipe-$i.$e | ssh -x example.com "cat >index-$i.$e" &) *   done * done * </pre>  * <p>If we now start a partitioning process generating three local indices named <samp>pipe-0</samp>, * <samp>pipe-1</samp> and <samp>pipe-2</samp> * all pipes will be written to by the process, and the data will create remotely * indices <samp>index-0</samp>, <samp>index-1</samp> and <samp>index-2</samp>. * * @author Sebastiano Vigna *  * @since 1.0.1 */public class PartitionLexically {	private static final Logger LOGGER = Util.getLogger( PartitionLexically.class );	/**  The default buffer size for all involved indices. */	public final static int DEFAULT_BUFFER_SIZE = 1024 * 1024;		/** The number of local indices. */	private final int numIndices;	/** The output basenames. */	private final String outputBasename;	/** The array of local output basenames. */	private final String[] localBasename;	/** The input basename. */	private final String inputBasename;	/** The size of I/O buffers. */	private final int bufferSize;	/** The filename of the strategy used to partition the index. */	private final String strategyFilename;	/** The strategy used to partition the index. 
*/
	private final LexicalPartitioningStrategy strategy;
	/** The additional local properties of each local index. */
	private final Properties[] strategyProperties;
	/** The logging interval. */
	private final long logInterval;

	/** Creates a new lexical partitioner.
	 *
	 * <p>Besides storing its arguments, the constructor asks the strategy for the
	 * number of local indices and for its properties, and precomputes the basename of
	 * each local index as <code><var>outputBasename</var>-<var>i</var></code>, where
	 * <var>i</var> ranges from 0 (inclusive) to the number of local indices (exclusive).
	 *
	 * @param inputBasename the input basename.
	 * @param outputBasename the output basename, from which the local basenames are derived.
	 * @param strategy the strategy used to partition the index.
	 * @param strategyFilename the filename of the strategy used to partition the index.
	 * @param bufferSize the size of I/O buffers.
	 * @param logInterval the logging interval.
	 */
	public PartitionLexically( final String inputBasename,
			final String outputBasename,
			final LexicalPartitioningStrategy strategy,
			final String strategyFilename,
			final int bufferSize,
			final long logInterval ) {
		this.inputBasename = inputBasename;
		this.outputBasename = outputBasename;
		this.strategy = strategy;
		this.strategyFilename = strategyFilename;
		this.bufferSize = bufferSize;
		this.logInterval = logInterval;
		// Everything below is derived from the strategy rather than passed in.
		numIndices = strategy.numberOfLocalIndices();
		strategyProperties = strategy.properties();
		localBasename = new String[ numIndices ];
		// Local basenames are the output basename followed by a dash and the local index number.
		for( int i = 0; i < numIndices; i++ ) localBasename[ i ] = outputBasename + "-" + i;
	}

	public void runTermsOnly() throws IOException {		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );				final PrintWriter[] localTerms = new PrintWriter[ numIndices ]; 		final int numTerms[] = new int[ numIndices ];		final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );				for( int i = 0; i < numIndices; i++ ) localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );		// The current term		final MutableString currTerm = new MutableString();				pl.itemsName = "terms";		pl.logInterval = logInterval;		pl.start( "Partitioning index terms..." );		int termNumber = 0, k;				while( terms.readLine( currTerm ) != null ) {			k = strategy.localIndex( termNumber ); // The local index for this term			if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();			numTerms[ k ]++;			currTerm.println( localTerms[ k ] );			pl.update();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -