📄 combine.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
package it.unimi.dsi.mg4j.tool;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.AbstractIntIterator;import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.IndexReader;import it.unimi.dsi.mg4j.index.IndexWriter;import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;import it.unimi.dsi.mg4j.index.TermProcessor;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.lang.ObjectParser;import 
it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.Properties;import java.io.BufferedWriter;import java.io.Closeable;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.PrintStream;import java.io.PrintWriter;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.configuration.ConfigurationMap;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;import com.martiansoftware.jsap.stringparsers.FileStringParser;/** Combines several indices. *  * <p>Indices may be combined in several different ways. This abstract class * contains code that is common to classes such as {@link it.unimi.dsi.mg4j.tool.Merge} * or {@link it.unimi.dsi.mg4j.tool.Concatenate}: essentially, command line parsing, * index opening, and term list fusion are taken care of. Then, the template method * {@link #combine(int)} must write into {@link #indexWriter} the combined inverted * list, returning the resulting frequency. *  * <p>Note that by combining a single index into a new one you can recompress an index * with different compression parameters (which includes the possibility of eliminating * positions or counts). *  * <p>The subclasses of this class must implement {@link #combine(int)} so that indices * with different sets of features are combined keeping the largest set of features requested * by the user. 
For instance, combining an index with positions and an index with counts, but * no positions, should generate an index with counts but no positions.  * * <p><strong>Warning</strong>: a combination requires opening <em>three</em> files per input index, * plus a few more files for the output index. If the combination process is interrupted by * an exception claiming that there are too many open files, check how to increase the * number of files you can open (usually, for instance on UN*X, there is a global and a per-process limit, * so be sure to set both). *  * <h2>Read-once indices, readers, and distributed index combination</h2> *  * <p>If the {@linkplain it.unimi.dsi.mg4j.index.Index indices} and  * {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndexReader bitstream index readers} involved in the * combination are <em>read-once</em> (i.e., opening an index and reading once its contents sequentially * causes each file composing the index to be read exactly once)  * <em>then also {@link it.unimi.dsi.mg4j.tool.Combine} implementations should be read-once</em> ({@link it.unimi.dsi.mg4j.tool.Concatenate}, * {@link it.unimi.dsi.mg4j.tool.Merge} and {@link it.unimi.dsi.mg4j.tool.Paste} are). *  * <p>This means, in particular, that index combination can be performed from <em>pipes</em>, which in * turn can be filled, for instance, with data coming from the network. In other words, albeit this * class is theoretically based on a number of indices existing on a local disk, those indices can be * substituted with suitable pipes filled with remote data without affecting the combination process. 
* For instance, the following <samp>bash</samp> code creates three sets of pipes: * <pre style="margin: 1em 0"> * for i in 0 1 2; do *   for e in frequencies globcounts index offsets properties sizes terms; do  *     mkfifo pipe$i.$e *   done * done * </pre>  *  * <p>Each pipe should be then filled with suitable data, for instance obtained from the net (assuming * you have indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>): * <pre style="margin: 1em 0"> * for i in 0 1 2; do  *   for e in frequencies globcounts index offsets properties sizes terms; do  *     (ssh -x example.com cat index$i.$e >pipe$i.$e &) *   done * done * </pre>  * <p>Now all pipes will be filled with data from the corresponding remote files, and * combining the indices <samp>pipe0</samp>, <samp>pipe1</samp> and <samp>pipe2</samp> * will give the same result as combining <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> * on the remote system. *  * @author Sebastiano Vigna * @since 1.0 */public abstract class Combine {	private static final Logger LOGGER = Util.getLogger( Combine.class );	private final static boolean DEBUG = false;	/** The default buffer size. */	public static final int DEFAULT_BUFFER_SIZE = 1024 * 1024;		/** The number of indices to be merged. */	final protected int numIndices;	/** The array of indices to be merged. */	final protected BitStreamIndex[] index;	/** An array of index readers parallel to {@link #index}. */	final protected IndexReader[] indexReader;	/** An array of index iterators parallel to {@link #index} (filled by concrete implementations). */	final protected IndexIterator[] indexIterator;	/** An array of input bit streams, returning the global counts for each index. */	private final InputBitStream[] globCounts;	/** Whether to output global counts. */	private boolean writeGlobCounts; 	/** Whether to output sizes. 
*/	private boolean writeSizes; 	/** Compute only index metadata (sizes, terms and globcounts). */	private final boolean metadataOnly; 	/** An array of mutable strings, containing the last term read for a given index. */	private MutableString[] term;	/** An array of fast buffered readers, used to read the terms of each index. */	private FastBufferedReader[] termReader;	/** The queue containing terms. */	protected ObjectHeapSemiIndirectPriorityQueue<MutableString> termQueue;	/** The overall number of documents. */	protected final int numberOfDocuments;	/** The overall number of occurrences. */	protected long numberOfOccurrences;	/** The maximum count in the merged index. */	protected int maxCount;	/** The array of input basenames. */	protected final String[] inputBasename;	/** The output basename. */	private final String outputBasename;	/** The size of I/O buffers. */	private final int bufferSize;	/** The logging interval. */	private final long logInterval;	/** The index writer for the merged index. */ 	protected IndexWriter indexWriter;	/** Whether {@link #indexWriter} has counts. */	protected final boolean hasCounts;	/** Whether {@link #indexWriter} has positions. */	protected final boolean hasPositions;	/** Whether {@link #indexWriter} has payloads. */	protected final boolean hasPayloads;	/** Additional properties for the merged index. */	private Properties additionalProperties;	/** An array partially filled with the indices (as offsets in {@link #index}) participating to the merge process for the current term. */	protected int[] usedIndex;	/** For each index, the frequency of the current term (given that it is present). */	final protected int[] frequency;	/** A cache for positions. */	protected int[] position;	/** The size of each document. */	protected int[] size;		public Combine( final String outputBasename,			final String[] inputBasename,			final boolean metadataOnly,
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -