📄 combine.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.ints.AbstractIntIterator;import it.unimi.dsi.fastutil.ints.IntIterator;import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;import it.unimi.dsi.mg4j.index.BitStreamHPIndexWriter;import it.unimi.dsi.mg4j.index.BitStreamIndex;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexIterator;import it.unimi.dsi.mg4j.index.IndexReader;import it.unimi.dsi.mg4j.index.IndexWriter;import it.unimi.dsi.mg4j.index.SkipBitStreamIndexWriter;import it.unimi.dsi.mg4j.index.TermProcessor;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.Util;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.lang.ObjectParser;import 
it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.Properties;import java.io.BufferedWriter;import java.io.Closeable;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.PrintStream;import java.io.PrintWriter;import java.lang.reflect.InvocationTargetException;import java.net.URISyntaxException;import java.util.Arrays;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.commons.configuration.ConfigurationMap;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;import com.martiansoftware.jsap.stringparsers.FileStringParser;/** Combines several indices. * * <p>Indices may be combined in several different ways. This abstract class * contains code that is common to classes such as {@link it.unimi.dsi.mg4j.tool.Merge} * or {@link it.unimi.dsi.mg4j.tool.Concatenate}: essentially, command line parsing, * index opening, and term list fusion are taken care of. Then, the template method * {@link #combine(int)} must write into {@link #indexWriter} the combined inverted * list, returning the resulting frequency. * * <p>Note that by combining a single index into a new one you can recompress an index * with different compression parameters (which includes the possibility of eliminating * positions or counts). * * <p>The subclasses of this class must implement {@link #combine(int)} so that indices * with different sets of features are combined keeping the largest set of features requested * by the user. 
For instance, combining an index with positions and an index with counts, but * no positions, should generate an index with counts but no positions. * * <p><strong>Warning</strong>: a combination requires opening <em>three</em> files per input index, * plus a few more files for the output index. If the combination process is interrupted by * an exception claiming that there are too many open files, check how to increase the * number of files you can open (usually, for instance on UN*X, there is a global and a per-process limit, * so be sure to set both). * * <h2>Read-once indices, readers, and distributed index combination</h2> * * <p>If the {@linkplain it.unimi.dsi.mg4j.index.Index indices} and * {@linkplain it.unimi.dsi.mg4j.index.BitStreamIndexReader bitstream index readers} involved in the * combination are <em>read-once</em> (i.e., opening an index and reading once its contents sequentially * causes each file composing the index to be read exactly once) * <em>then also {@link it.unimi.dsi.mg4j.tool.Combine} implementations should be read-once</em> ({@link it.unimi.dsi.mg4j.tool.Concatenate}, * {@link it.unimi.dsi.mg4j.tool.Merge} and {@link it.unimi.dsi.mg4j.tool.Paste} are). * * <p>This means, in particular, that index combination can be performed from <em>pipes</em>, which in * turn can be filled, for instance, with data coming from the network. In other words, albeit this * class is theoretically based on a number of indices existing on a local disk, those indices can be * substituted with suitable pipes filled with remote data without affecting the combination process. 
* For instance, the following <samp>bash</samp> code creates three sets of pipes: * <pre style="margin: 1em 0"> * for i in 0 1 2; do * for e in frequencies globcounts index offsets properties sizes terms; do * mkfifo pipe$i.$e * done * done * </pre> * * <p>Each pipe should be then filled with suitable data, for instance obtained from the net (assuming * you have indices <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> on <samp>example.com</samp>): * <pre style="margin: 1em 0"> * for i in 0 1 2; do * for e in frequencies globcounts index offsets properties sizes terms; do * (ssh -x example.com cat index$i.$e >pipe$i.$e &) * done * done * </pre> * <p>Now all pipes will be filled with data from the corresponding remote files, and * combining the indices <samp>pipe0</samp>, <samp>pipe1</samp> and <samp>pipe2</samp> * will give the same result as combining <samp>index0</samp>, <samp>index1</samp> and <samp>index2</samp> * on the remote system. * * @author Sebastiano Vigna * @since 1.0 */public abstract class Combine { private static final Logger LOGGER = Util.getLogger( Combine.class ); private final static boolean DEBUG = false; /** The default buffer size. */ public static final int DEFAULT_BUFFER_SIZE = 1024 * 1024; /** The number of indices to be merged. */ final protected int numIndices; /** The array of indices to be merged. */ final protected BitStreamIndex[] index; /** An array of index readers parallel to {@link #index}. */ final protected IndexReader[] indexReader; /** An array of index iterators parallel to {@link #index} (filled by concrete implementations). */ final protected IndexIterator[] indexIterator; /** An array of input bit streams, returning the global counts for each index. */ private final InputBitStream[] globCounts; /** Whether to output global counts. */ private boolean writeGlobCounts; /** Whether to output sizes. */ private boolean writeSizes; /** Compute only index metadata (sizes, terms and globcounts). 
*/ private final boolean metadataOnly; /** An array of mutable strings, containing the last term read for a given index. */ private MutableString[] term; /** An array of fast buffered readers, used to read the terms of each index. */ private FastBufferedReader[] termReader; /** The queue containing terms. */ protected ObjectHeapSemiIndirectPriorityQueue<MutableString> termQueue; /** The overall number of documents. */ protected final int numberOfDocuments; /** The overall number of occurrences. */ protected long numberOfOccurrences; /** The maximum count in the merged index. */ protected int maxCount; /** The array of input basenames. */ protected final String[] inputBasename; /** The output basename. */ private final String outputBasename; /** The size of I/O buffers. */ private final int bufferSize; /** The logging interval. */ private final long logInterval; /** The index writer for the merged index. */ protected IndexWriter indexWriter; /** Whether {@link #indexWriter} has counts. */ protected final boolean hasCounts; /** Whether {@link #indexWriter} has positions. */ protected final boolean hasPositions; /** Whether {@link #indexWriter} has payloads. */ protected final boolean hasPayloads; /** Additional properties for the merged index. */ private Properties additionalProperties; /** An array partially filled with the indices (as offsets in {@link #index}) participating to the merge process for the current term. */ protected int[] usedIndex; /** For each index, the frequency of the current term (given that it is present). */ final protected int[] frequency; /** A cache for positions. */ protected int[] position; /** The size of each document. */ protected int[] size; public Combine( final String outputBasename, final String[] inputBasename, final boolean metadataOnly,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -