📄 scan.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.Hash;import it.unimi.dsi.fastutil.ints.IntArrayList;import it.unimi.dsi.fastutil.ints.IntArrays;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;import it.unimi.dsi.fastutil.objects.ObjectList;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.mg4j.document.DocumentCollection;import it.unimi.dsi.mg4j.document.DocumentFactory;import it.unimi.dsi.mg4j.document.DocumentIterator;import it.unimi.dsi.mg4j.document.DocumentSequence;import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;import it.unimi.dsi.mg4j.document.InputStreamDocumentSequence;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;import it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder;import it.unimi.dsi.mg4j.index.BitStreamIndexWriter;import it.unimi.dsi.mg4j.index.CompressionFlags;import it.unimi.dsi.mg4j.index.DiskBasedIndex;import it.unimi.dsi.mg4j.index.DowncaseTermProcessor;import it.unimi.dsi.mg4j.index.FileIndex;import it.unimi.dsi.mg4j.index.Index;import it.unimi.dsi.mg4j.index.IndexWriter;import 
it.unimi.dsi.mg4j.index.NullTermProcessor;import it.unimi.dsi.mg4j.index.TermProcessor;import it.unimi.dsi.mg4j.index.CompressionFlags.Coding;import it.unimi.dsi.mg4j.index.CompressionFlags.Component;import it.unimi.dsi.mg4j.index.cluster.ContiguousDocumentalStrategy;import it.unimi.dsi.mg4j.index.cluster.DocumentalCluster;import it.unimi.dsi.mg4j.index.cluster.DocumentalConcatenatedCluster;import it.unimi.dsi.mg4j.index.cluster.DocumentalMergedCluster;import it.unimi.dsi.mg4j.index.cluster.IdentityDocumentalStrategy;import it.unimi.dsi.mg4j.index.cluster.IndexCluster;import it.unimi.dsi.mg4j.index.payload.DatePayload;import it.unimi.dsi.mg4j.index.payload.IntegerPayload;import it.unimi.dsi.mg4j.index.payload.Payload;import it.unimi.dsi.mg4j.io.ByteArrayPostingList;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.fastutil.io.FastByteArrayOutputStream;import it.unimi.dsi.io.InputBitStream;import it.unimi.dsi.io.OutputBitStream;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.Util;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.lang.ObjectParser;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.util.Properties;import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;import java.io.File;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.FileWriter;import java.io.IOException;import java.io.OutputStreamWriter;import java.io.PrintWriter;import java.io.Reader;import java.io.Serializable;import java.lang.reflect.InvocationTargetException;import java.util.Arrays;import java.util.EnumMap;import java.util.Map;import org.apache.commons.configuration.ConfigurationException;import org.apache.log4j.Logger;import cern.colt.GenericSorting;import cern.colt.Sorting;import cern.colt.Swapper;import cern.colt.function.IntComparator;import cern.colt.function.LongComparator;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import 
com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.ParseException;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;import com.martiansoftware.jsap.stringparsers.LongSizeStringParser;/** * Scans a document sequence, dividing it in batches of occurrences and writing for each batch a * corresponding subindex. * * <P>This class (more precisely, its * {@link #run(String, DocumentSequence, TermProcessor, String, int, int, int[], VirtualDocumentResolver[], int[], String, long, String) run()} * method) reads a document sequence and produces several <em>batches</em>, that is, subindices * corresponding to subsets of term/document pairs of the collection. A set of batches is generated * for each indexed field of the collection. A main method invokes the above method setting its * parameters using suitable options. * * <p>Unless a serialised {@link it.unimi.dsi.mg4j.document.DocumentSequence} is specified using * the suitable option, an implicit {@link it.unimi.dsi.mg4j.document.InputStreamDocumentSequence} * is created using separator byte (default is 10, i.e., newline). In the latter case, the factory * and its properties can be set with command-line options. * * <P>The only mandatory argument is a <em>basename</em>, which will be used to stem the names * of all files generated. The first batch of a field named <var>field</var> will use the basename * <samp><var>basename-field</var>@0</samp>, the second batch <samp><var>basename-field</var>@1</samp> * and so on. It is also possible to specify a separate directory for batch files (e.g., for easier * {@linkplain #cleanup(String, int, File) cleanup} when they are no longer necessary). * * <P>Since documents are read sequentially, every document has a <em>natural * index</em> starting * from 0. 
If no remapping (i.e., renumbering) is specified, the <em>document * index</em> of each document * corresponds to its natural index. If, however, a remapping is specified, under the form of a * list of integers, the document index of a document is the integer found in the corresponding * position of the list. More precisely, a remapping for <var>N</var> documents is a list of * <var>N</var> distinct integers, and a document with natural index <var>i</var> has document * index given by the <var>i</var>-th element of the list. This is useful when indexing statically * ranked documents (e.g., if you are indexing a part of the web and would like the index to return * documents with higher static rank first). If the remapping file is provided, it must be a * sequence of integers, written using the {@link java.io.DataOutputStream#writeInt(int)} method; if * <var>N</var> is the number of documents, the file is to contain exactly <var>N</var> distinct * integers. The integers need not be between 0 and <var>N</var>-1, to allow the remapping of * subindices (but a warning will be logged in this case, just to be sure you know what you're doing). * * <P>Also every term has an associated number starting from 0, assigned in lexicographic order. * * <h2>Index types and indexing types</h2> * * <p>A <em>standard index</em> contains a list of terms, and for each term a posting list. Each * posting contains mandatorily a document pointer, and then, optionally, the count and the * positions of the term (whether the last two elements appear can be specified using suitable * {@linkplain CompressionFlags compression flags}). * * <p>The indexing type of a standard index can be {@link IndexingType#STANDARD}, * {@link IndexingType#REMAPPED} or {@link IndexingType#VIRTUAL}. In the first case, we index the * words occurring in documents as usual. In the second case, before writing the index all documents * are renumbered following a provided map. 
In the third case (used only with * {@link it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#VIRTUAL} fields) indexing is performed on a virtual document * obtained by collating a number of {@linkplain VirtualDocumentFragment fragments}. * Fragments are associated to documents by some key, * and a {@link VirtualDocumentResolver} turns a key into a document natural number, so that the * collation process can take place (a settable gap is inserted between fragments). * * <p>Besides storing document pointers, document counts, and position, MG4J makes it possible to * store an arbitrary <em>payload</em> with each posting. This feature is presently used only to * create <em>payload-based indices</em>—indices without counts and positions that contain * a single, dummy word <samp>#</samp>. They are actually used to store arbitrary data associated * to each document, such as dates and integers: using a special syntax, it is then possible to specify * <em>range queries</em> on the values of such fields. * * <p>The main difference between standard and payload-based indices is that the first type is * handled by instances of this class, whereas the second type is handled by instances of * {@link Scan.PayloadAccumulator}. The * {@link #run(String, DocumentSequence, TermProcessor, String, int, int, int[], VirtualDocumentResolver[], int[], String, long, String) run()} * method creates a set of suitable instances, one for each indexed field, and feeds them in * parallel with data from the appropriate field of the same document. * * <h2>Batch subdivision and content</h2> * * <p>The scanning process uses a user-settable number of documents per batch, and will try to * build batches containing exactly that number of documents (for all indexed fields). There are of * course space constraints that could make building exact batches impossible, as the entire data of * a batch must fit into core memory. If memory is too low, a batch will be generated with fewer * documents than expected. 
* * <p>In some extreme cases, it could be impossible to produce cleanly a set of batches for all * fields: in that case, <em>emergency dumps</em> will create <em>fragmented batches</em>—instead * of a single batch containing <var>k</var> documents, a certain field will generate two separate * batches. As a consequence, different fields may have a different number of batches, but a simple * inspection of the property files (see below) will reveal the details of the emergency dumps (and * {@link Combine} can be used to rebuild the desired exact batches, if necessary). * * <p>The larger the number of documents in a batch is, the quicker index construction will be. * Usually, some experiments and a look at the logs is all that suffices to find out good parameters * for the Java virtual machine maximum memory setting and for the number of documents per batch. * * <P>These are the files currently generated for each batch (<samp><var>basename</var></samp> * denotes the basename of the batch, not of the index): * * <dl> * * <dt><samp><var>basename</var>.terms</samp> * * <dd>For each indexed term, the corresponding literal string in UTF-8 encoding. More precisely, * the <var>i</var>-th line of the file (starting from 0) contains the literal string corresponding * to term index <var>i</var>. * * <dt><samp><var>basename</var>.terms.unsorted</samp> * * <dd>The list of indexed terms in the same order in which they were met in the document * collection. This list is not produced unless you ask for it explicitly with a suitable option. * * <dt><samp><var>basename</var>.frequencies</samp> * * <dd>For each term, the number of documents in which the term appears in γ coding. More * precisely, the <var>i</var>-th integer of the file (starting from 0) is the number of documents in * which the term of index <var>i</var> appears. 
* * <dt><samp><var>basename</var>.sizes</samp> (not generated for payload-based indices) * * <dd>For each indexed document, the corresponding size (=number of words) in γ coding. More * precisely, <var>i</var>-th integer of the file (starting from 0) is the size in words of the * document of index <var>i</var>. * * <dt><samp><var>basename</var>.index</samp> * * <dd>The inverted index. * * <dt><samp><var>basename</var>.offsets</samp> (not generated for payload-based indices) * * <dd>For each term, the bit offset in <samp><var>basename</var>.index</samp> at which the * inverted lists start. More precisely, the first integer is the offset for term 0 in γ * coding, and then the <var>i</var>-th integer is the difference between the <var>i</var>-th and * the <var>i</var>−1-th offset in γ coding. If <var>T</var> terms were indexed, this * file will contain <var>T</var>+1 integers, the last being the difference (in bits) between the * length of the entire inverted index and the offset of the last inverted list. * * <dt><samp><var>basename</var>.globcounts</samp> (not generated for payload-based indices) * * <dd>For each term, the number of its occurrences throughout the whole document collection, in * γ coding. More precisely, the <var>i</var>-th integer of the file (starting from 0) is the * number of occurrences of the term of index <var>i</var>. * * * <dt><samp><var>basename</var>.properties</samp> * * <dd>A Java {@linkplain Properties property file} containing information about the index. 
* Currently, the following keys (taken from {@link it.unimi.dsi.mg4j.index.Index.PropertyKeys}) * are generated: * * <dl> <dt>indexclass <dd>the class used to generate the batch (presently, {@link BitStreamIndexWriter}); * <dt>documents <dd>number of documents in the collection; <dt>terms <dd>number of indexed terms; * <dt>occurrences <dd>number of words throughout the whole collection; <dt>postings <dd>number * of postings (pairs term/document) throughout the whole collection; <dt>maxdocsize <dd>maximum * size of a document in words; <dt>termprocessor <dd>the term processor (if any) used during the * index construction; <dt>coding <dd>one or more items, each defining a key/value pair for the * <em>flag map</em> of the index; each pair is of the form <samp><var>component</var>:<var>coding</var></samp> * (see {@link it.unimi.dsi.mg4j.index.CompressionFlags}); <dt>field <dd>the name of the field * that generated this batch (optional); <dt>maxcount <dd>the maximum count in the collection, that * is, the maximum count of a term maximised on all terms and documents; <dt>size <dd>the index * size in bits; </dl> * * <dt><samp><var>basename</var>.cluster.properties</samp> * * <dd>A Java {@linkplain Properties property file} containing information about the set of batches * seen as a {@link it.unimi.dsi.mg4j.index.cluster.DocumentalCluster}. The keys are the same as in the * previous case, but additionally a number of <samp>localindex</samp> entries specify the basename * of the batches, and a <samp>splitstrategy</samp>. After creating manually suitable term maps for * each batch, you will be able to access the set of batches as a single index (but note that * standard batches have <em>no skip structure</em>, and should not be used * in production; if you intend to do so, you have to write a customised scanning procedure). 
* * </dl> * * @author Sebastiano Vigna * @since 1.0 */public class Scan { private final static Logger LOGGER = Util.getLogger( Scan.class ); private final static boolean ASSERTS = false; public static enum IndexingType { /** A standard index—documents will be provided in increasing order. */ STANDARD,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -