📄 trecdocumentcollection.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Sebastiano Vigna * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedInputStream;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.ObjectArrays;import it.unimi.dsi.fastutil.objects.ObjectIterator;import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.SegmentedInputStream;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;import it.unimi.dsi.mg4j.util.MG4JClassParser;import it.unimi.dsi.logging.ProgressLogger;import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.io.UnsupportedEncodingException;import java.lang.reflect.InvocationTargetException;import java.util.Arrays;import java.util.zip.GZIPInputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A collection for the TREC GOV2 data set. *  * <p>The documents are stored as a set of descriptors, representing the (possibly gzipped) file * they are contained in and the start and stop position in that file. To manage * descriptors later we rely on {@link SegmentedInputStream}. *  * <p>To interpret a file, we read up to <samp>&lt;DOC&gt;</samp> and place a start * marker there, we advance to the header and store the URI. An intermediate * marker is placed at the end of the doc header tag and a stop marker just * before <samp>&lt;/DOC&gt;</samp>. *  * <p>The resulting {@link SegmentedInputStream} has two segments * per document. By using a {@link it.unimi.dsi.mg4j.document.CompositeDocumentFactory}, the * first segment is parsed by a {@link it.unimi.dsi.mg4j.document.TRECHeaderDocumentFactory}, * whereas the second segment is parsed by a user-provided factory&mdash;usually, * an {@link it.unimi.dsi.mg4j.document.HtmlDocumentFactory}. *  * <p>The collection provides both sequential access to all documents via the * iterator and random access to a given document. However, the two operations * are performed very differently as the sequential operation is much more * performant than calling {@link #document(int)} repeatedly. *  * @author Alessio Orlandi * @author Luca Natali */public class TRECDocumentCollection extends AbstractDocumentCollection implements Serializable {	private static final Logger LOGGER = Logger.getLogger( TRECDocumentCollection.class );	private static final long serialVersionUID = -4251461013312968454L;		private static final boolean DEBUG = false;	/** Default buffer size, set up after some experiments. */	public static final String DEFAULT_BUFFER_SIZE = "64Ki";	/** The list of the files containing the documents. */	private String[] file;	/** Whether the files in {@link #file} are gzipped. */	private final boolean useGzip;	/** The document factory. */	protected DocumentFactory factory;	/** The list of document descriptors.  We assume that descriptors within the same file are contiguous */	protected transient ObjectArrayList<TRECDocumentDescriptor> descriptors;	/** The buffer size. */	private final int bufferSize;	/** The last returned stream. */	private SegmentedInputStream lastStream;	/** A compact description of the location and of the internal segmentation of	 * a TREC document inside a file. 	 */		private static class TRECDocumentDescriptor implements Cloneable {		/** A reference to the file containing this document. */		public int fileIndex;		/** The starting position of this document in the file. */		public long startMarker;		/** The starting position of the content of this document in the file. */		public int intermediateMarkerDiff;		/** The ending position. */		public int stopMarkerDiff;		// TODO: this computation should be moved in the caller		public TRECDocumentDescriptor(int findex, long start, long intermediateMarker, long stop) {			this.fileIndex = findex;			this.startMarker = start;			this.intermediateMarkerDiff = (int) (intermediateMarker - start);			this.stopMarkerDiff = (int) (stop - start);		}		public TRECDocumentDescriptor(int findex, long start,				int intermediateMarkerDiff, int stopMarkerDiff) {			this.fileIndex = findex;			this.startMarker = start;			this.intermediateMarkerDiff = intermediateMarkerDiff;			this.stopMarkerDiff = stopMarkerDiff;		}		public final long[] toSegments() {			return new long[] { startMarker, startMarker + intermediateMarkerDiff, stopMarkerDiff + startMarker };		}		public Object clone() {			return new TRECDocumentDescriptor(this.fileIndex, this.startMarker,					this.startMarker + this.intermediateMarkerDiff,					this.stopMarkerDiff + this.startMarker);		}	}	protected final static byte[] DOC_OPEN, DOC_CLOSE, DOCNO_OPEN, DOCNO_CLOSE, DOCHDR_OPEN, DOCHDR_CLOSE;		static {		try {			DOC_OPEN = "<DOC>".getBytes( "ASCII" );			DOC_CLOSE = "</DOC>".getBytes( "ASCII" );			DOCNO_OPEN = "<DOCNO>".getBytes( "ASCII" );			DOCNO_CLOSE = "</DOCNO>".getBytes( "ASCII" );			DOCHDR_OPEN = "<DOCHDR>".getBytes( "ASCII" );			DOCHDR_CLOSE = "</DOCHDR>".getBytes( "ASCII" );		}		catch ( UnsupportedEncodingException cantHappen ) {			throw new RuntimeException( cantHappen );		}	}		protected static boolean equals( byte[] a, int len, byte[] b ) {		if ( len != b.length ) return false;		while( len-- != 0 ) if ( a[ len ] != b[ len ] ) return false;		return true;	}	byte buffer[] = new byte[ 8 * 1024 ];	private void parseContent( int fileIndex, InputStream is ) throws IOException {		long currStart, currStop, currInter, oldPos;		boolean pastHeader = false, startedBlock = false;		LOGGER.debug( "Processing file " + fileIndex + " (" + file[ fileIndex ] + ")" );		FastBufferedInputStream fbis = new FastBufferedInputStream( is, bufferSize );		currStart = 0; // make java compiler happy.		currInter = 0;		oldPos = 0;				int l;				while ( ( l = fbis.readLine( buffer ) ) != -1 ) {			if ( l == buffer.length ) {				// We filled the buffer, which means we have a very very long line. Let's skip it.				while ( ( l = fbis.readLine( buffer ) ) == buffer.length );			}			else {				if ( !startedBlock && equals( buffer, l, DOC_OPEN ) ) {					currStart = oldPos;					startedBlock = true; // Start of the current block (includes <DOC> marker)				}				else if ( startedBlock && equals( buffer, l, DOC_CLOSE ) ) {					currStop = oldPos;					if ( DEBUG ) LOGGER.debug( "Setting markers <" + currStart + "," + currInter + ", " + currStop + ">" );					descriptors.add( new TRECDocumentDescriptor( fileIndex, currStart, currInter, currStop ) );					startedBlock = pastHeader = false;				}				else if ( startedBlock && !pastHeader && equals( buffer, l, DOCHDR_CLOSE ) ) {					currInter = fbis.position();					pastHeader = true;				}				oldPos = fbis.position();			}		}		fbis.close();	}	/**	 * Copy constructor (that is, the one used by {@link #copy()}. Just	 * initializes final fields	 */	protected TRECDocumentCollection( String[] file, DocumentFactory factory, ObjectArrayList<TRECDocumentDescriptor> descriptors, int bufferSize, boolean useGzip ) {		this.useGzip = useGzip;		this.file = file;		this.bufferSize = bufferSize;		this.factory = factory;		this.descriptors = descriptors;	}	public TRECDocumentCollection copy() {		return new TRECDocumentCollection( file, factory.copy(), descriptors, bufferSize, useGzip );
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -