📄 trecdocumentcollection.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.ObjectArrays;import it.unimi.dsi.fastutil.objects.ObjectIterator;import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.SegmentedInputStream;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.logging.ProgressLogger;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.io.UnsupportedEncodingException;import java.lang.reflect.InvocationTargetException;import java.util.Arrays;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import 
com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** A collection for the TREC GOV2 data set.
 *
 * <p>The documents are stored as a set of descriptors, representing the (possibly gzipped) file
 * they are contained in and the start and stop position in that file. To manage
 * descriptors later we rely on {@link SegmentedInputStream}.
 *
 * <p>To interpret a file, we read up to <samp>&lt;DOC&gt;</samp> and place a start
 * marker there. An intermediate marker is placed right after the line closing the
 * document header (<samp>&lt;/DOCHDR&gt;</samp>) and a stop marker just
 * before <samp>&lt;/DOC&gt;</samp>.
 *
 * <p>The resulting {@link SegmentedInputStream} has two segments
 * per document. By using a {@link it.unimi.dsi.mg4j.document.CompositeDocumentFactory}, the
 * first segment is parsed by a {@link it.unimi.dsi.mg4j.document.TRECHeaderDocumentFactory},
 * whereas the second segment is parsed by a user-provided factory&mdash;usually,
 * an {@link it.unimi.dsi.mg4j.document.HtmlDocumentFactory}.
 *
 * <p>The collection provides both sequential access to all documents via the
 * iterator and random access to a given document. However, the two operations
 * are performed very differently as the sequential operation is much more
 * performant than calling {@link #document(int)} repeatedly.
 *
 * @author Alessio Orlandi
 * @author Luca Natali
 */
public class TRECDocumentCollection extends AbstractDocumentCollection implements Serializable {
	/** The logger used by this class. */
	private static final Logger LOGGER = Logger.getLogger( TRECDocumentCollection.class );
	private static final long serialVersionUID = -4251461013312968454L;
	/** When true, the markers computed for each parsed document are logged at debug level. */
	private static final boolean DEBUG = false;
	/** Default buffer size, set up after some experiments. */
	public static final String DEFAULT_BUFFER_SIZE = "64Ki";
	/** The list of the files containing the documents. 
*/
	private String[] file;
	/** Whether the files in {@link #file} are gzipped. */
	private final boolean useGzip;
	/** The document factory. */
	protected DocumentFactory factory;
	/** The list of document descriptors. We assume that descriptors within the same file are contiguous. */
	protected transient ObjectArrayList<TRECDocumentDescriptor> descriptors;
	/** The buffer size. */
	private final int bufferSize;
	/** The last returned stream. */
	private SegmentedInputStream lastStream;

	/** A compact description of the location and of the internal segmentation of
	 * a TREC document inside a file.
	 *
	 * <p>Only the start marker is kept as an absolute (<code>long</code>) position; the
	 * intermediate and stop markers are kept as <code>int</code> offsets from the start marker.
	 */
	private static class TRECDocumentDescriptor implements Cloneable {
		/** The index (into the list of files) of the file containing this document. */
		public int fileIndex;
		/** The absolute starting position of this document in the file. */
		public long startMarker;
		/** The offset of the intermediate marker (start of the content), relative to {@link #startMarker}. */
		public int intermediateMarkerDiff;
		/** The offset of the stop marker, relative to {@link #startMarker}. */
		public int stopMarkerDiff;

		/** Creates a descriptor from absolute marker positions.
		 *
		 * @param findex the index of the file containing the document.
		 * @param start the absolute position of the start marker.
		 * @param intermediateMarker the absolute position of the intermediate marker.
		 * @param stop the absolute position of the stop marker.
		 * @throws IllegalArgumentException if a marker is so far from <code>start</code> that
		 * the difference cannot be represented as an <code>int</code>.
		 */
		// TODO: this computation should be moved in the caller
		public TRECDocumentDescriptor( int findex, long start, long intermediateMarker, long stop ) {
			this( findex, start, toIntOffset( intermediateMarker - start ), toIntOffset( stop - start ) );
		}

		/** Creates a descriptor from a start position and marker offsets relative to it.
		 *
		 * @param findex the index of the file containing the document.
		 * @param start the absolute position of the start marker.
		 * @param intermediateMarkerDiff the offset of the intermediate marker from <code>start</code>.
		 * @param stopMarkerDiff the offset of the stop marker from <code>start</code>.
		 */
		public TRECDocumentDescriptor( int findex, long start, int intermediateMarkerDiff, int stopMarkerDiff ) {
			this.fileIndex = findex;
			this.startMarker = start;
			this.intermediateMarkerDiff = intermediateMarkerDiff;
			this.stopMarkerDiff = stopMarkerDiff;
		}

		/** Narrows a marker offset to an <code>int</code>, refusing values that would silently wrap.
		 *
		 * @param offset a marker offset relative to the start marker.
		 * @return <code>offset</code> as an <code>int</code>.
		 * @throws IllegalArgumentException if <code>offset</code> does not fit into an <code>int</code>.
		 */
		private static int toIntOffset( long offset ) {
			final int narrowed = (int)offset;
			// A plain cast would keep only the low 32 bits and corrupt the segment boundaries.
			if ( narrowed != offset ) throw new IllegalArgumentException( "Marker offset " + offset + " does not fit into an int" );
			return narrowed;
		}

		/** Returns the three absolute positions delimiting the two segments of this document.
		 *
		 * @return an array containing the start, intermediate and stop marker, in this order.
		 */
		public final long[] toSegments() {
			return new long[] { startMarker, startMarker + intermediateMarkerDiff, stopMarkerDiff + startMarker };
		}

		/** Returns a new descriptor with the same file index and markers as this one. */
		public Object clone() {
			return new TRECDocumentDescriptor( this.fileIndex, this.startMarker, this.intermediateMarkerDiff, this.stopMarkerDiff );
		}
	}

	protected final static byte[] DOC_OPEN, DOC_CLOSE, DOCNO_OPEN, DOCNO_CLOSE, DOCHDR_OPEN, DOCHDR_CLOSE;

	static {
		try {
DOC_OPEN = "<DOC>".getBytes( "ASCII" ); DOC_CLOSE = "</DOC>".getBytes( "ASCII" ); DOCNO_OPEN = "<DOCNO>".getBytes( "ASCII" ); DOCNO_CLOSE = "</DOCNO>".getBytes( "ASCII" ); DOCHDR_OPEN = "<DOCHDR>".getBytes( "ASCII" ); DOCHDR_CLOSE = "</DOCHDR>".getBytes( "ASCII" ); } catch ( UnsupportedEncodingException cantHappen ) { throw new RuntimeException( cantHappen ); } } protected static boolean equals( byte[] a, int len, byte[] b ) { if ( len != b.length ) return false; while( len-- != 0 ) if ( a[ len ] != b[ len ] ) return false; return true; } byte buffer[] = new byte[ 8 * 1024 ]; private void parseContent( int fileIndex, InputStream is ) throws IOException { long currStart, currStop, currInter, oldPos; boolean pastHeader = false, startedBlock = false; LOGGER.debug( "Processing file " + fileIndex + " (" + file[ fileIndex ] + ")" ); FastBufferedInputStream fbis = new FastBufferedInputStream( is, bufferSize ); currStart = 0; // make java compiler happy. currInter = 0; oldPos = 0; int l; while ( ( l = fbis.readLine( buffer ) ) != -1 ) { if ( l == buffer.length ) { // We filled the buffer, which means we have a very very long line. Let's skip it. while ( ( l = fbis.readLine( buffer ) ) == buffer.length ); } else { if ( !startedBlock && equals( buffer, l, DOC_OPEN ) ) { currStart = oldPos; startedBlock = true; // Start of the current block (includes <DOC> marker) } else if ( startedBlock && equals( buffer, l, DOC_CLOSE ) ) { currStop = oldPos; if ( DEBUG ) LOGGER.debug( "Setting markers <" + currStart + "," + currInter + ", " + currStop + ">" ); descriptors.add( new TRECDocumentDescriptor( fileIndex, currStart, currInter, currStop ) ); startedBlock = pastHeader = false; } else if ( startedBlock && !pastHeader && equals( buffer, l, DOCHDR_CLOSE ) ) { currInter = fbis.position(); pastHeader = true; } oldPos = fbis.position(); } } fbis.close(); } /** * Copy constructor (that is, the one used by {@link #copy()}. 
Just * initializes final fields */
	protected TRECDocumentCollection( String[] file, DocumentFactory factory, ObjectArrayList<TRECDocumentDescriptor> descriptors, int bufferSize, boolean useGzip ) {
		// Plain assignments: the arguments are stored as given, without copying them.
		this.useGzip = useGzip;
		this.file = file;
		this.bufferSize = bufferSize;
		this.factory = factory;
		this.descriptors = descriptors;
	}

	/** Returns a new collection built on the same file list, descriptor list, buffer size
	 * and gzip flag as this one, but using a copy of the document factory
	 * (obtained via <code>factory.copy()</code>).
	 */
	public TRECDocumentCollection copy() {
		return new TRECDocumentCollection( file, factory.copy(), descriptors, bufferSize, useGzip );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -