⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 urlmphvirtualdocumentresolver.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.tool;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2007 Paolo Boldi
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.lang.RandomStringUtils;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** A virtual-document resolver based on document URIs.
 *
 * <p>Instances of this class store in a {@link StringMap}
 * all URIs from a collection, and consider a virtual-document specification a (possibly relative) URI. The
 * virtual-document specification is resolved against the document URI, and then the perfect hash is used
 * to retrieve the corresponding document.
 *
 * <p>This class provides a main method that helps in building serialised resolvers from URI lists.
 * In case of pathological document collections with duplicate URIs (most notably, the GOV2 collection
 * used for TREC evaluations), an option makes it possible to add random noise to duplicates, so that
 * minimal perfect hash construction does not go into an infinite loop. It is a rather crude solution, but it
 * is nonsensical to have duplicate URIs in the first place.
 */
public class URLMPHVirtualDocumentResolver implements VirtualDocumentResolver {
	private static final long serialVersionUID = 1L;
	private static final Logger LOGGER = Logger.getLogger( URLMPHVirtualDocumentResolver.class );

	/** The term map used by this resolver to associate URI strings to numbers. */
	private final StringMap<? extends CharSequence> url2DocumentPointer;
	/** The cached URI of the last argument to {@link #context(Document)}, or <code>null</code> if that URI could not be parsed. */
	private transient URI documentURI;

	/** Creates a new resolver backed by the given map.
	 *
	 * @param url2DocumentPointer a map from URI strings to document pointers.
	 */
	public URLMPHVirtualDocumentResolver( final StringMap<? extends CharSequence> url2DocumentPointer ) {
		this.url2DocumentPointer = url2DocumentPointer;
	}

	/** Sets the document against which relative virtual-document specifications will be resolved.
	 *
	 * <p>The normalised URI of the document is cached; if it is not a syntactically valid URI,
	 * the cached value is reset to <code>null</code> and relative specifications will not resolve.
	 *
	 * @param document the document providing the base URI.
	 */
	public void context( final Document document ) {
		try {
			documentURI = new URI( document.uri().toString() ).normalize();
		}
		catch ( URISyntaxException e ) {
			// An unparsable base URI just disables relative resolution for this document.
			documentURI = null;
		}
	}

	/** Resolves a virtual-document specification, interpreted as a (possibly relative) URI.
	 *
	 * @param virtualDocumentSpec the specification to be resolved.
	 * @return the value associated by the underlying map with the resolved URI, narrowed to an integer,
	 * or -1 if the specification is relative and no valid context is available, or if any
	 * exception is thrown while parsing or looking up the URI.
	 */
	public int resolve( final CharSequence virtualDocumentSpec ) {
		try {
			URI virtualURI = URI.create( virtualDocumentSpec.toString() ).normalize();
			if ( ! virtualURI.isAbsolute() ) {
				if ( documentURI == null ) return -1;
				virtualURI = documentURI.resolve( virtualURI );
			}
			// TODO discard opaque?
			return (int)url2DocumentPointer.getLong( virtualURI.toString() );
		}
		catch ( Exception e ) {
			// Deliberately best-effort: any malformed specification simply fails to resolve.
			return -1;
		}
	}

	/** Returns the number of documents known to this resolver.
	 *
	 * @return the size of the underlying map.
	 */
	public int numberOfDocuments() {
		return url2DocumentPointer.size();
	}

	/** Appends random path components to a URI until the filter accepts it as new.
	 *
	 * @param filter the Bloom filter recording the URIs seen so far.
	 * @param uri the URI to be made unique; it is modified in place.
	 */
	private static void makeUnique( final BloomFilter filter, final MutableString uri ) {
		while( ! filter.add( uri ) ) {
			LOGGER.debug( "Duplicate URI " + uri );
			uri.append( '/' ).append( RandomStringUtils.randomAlphanumeric( 32 ) );
		}
	}

	/** Loads into memory all URIs (one per line, UTF-8) available from standard input.
	 *
	 * @param filter the Bloom filter used to make URIs unique, or <code>null</code> if URIs must be left untouched.
	 * @param bufferSize the size of the I/O buffer.
	 * @return the list of URIs read.
	 */
	private static Collection<MutableString> readUrisFromStandardInput( final BloomFilter filter, final int bufferSize ) throws IOException {
		final ArrayList<MutableString> termList = new ArrayList<MutableString>();
		final ProgressLogger pl = new ProgressLogger();
		pl.itemsName = "URIs";
		final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );

		pl.start( "Reading URIs..." );
		while( termIterator.hasNext() ) {
			final MutableString uri = termIterator.next();
			if ( filter != null ) makeUnique( filter, uri );
			// The string is copied before being stored.
			termList.add( uri.copy() );
		}
		pl.done();

		return termList;
	}

	/** Copies a file of URIs (one per line, UTF-8) to a temporary file, making each URI unique along the way.
	 *
	 * <p>The input is decoded as UTF-8, the same encoding used to write the copy and to read it back
	 * later; decoding it with the platform default charset would corrupt non-ASCII URIs whenever
	 * that default is not UTF-8. Both streams are closed even if the copy fails.
	 *
	 * @param termFile the name of the file containing the original URIs.
	 * @param filter the Bloom filter used to make URIs unique.
	 * @param bufferSize the size of the I/O buffers.
	 * @return the name of the temporary file containing the unique URIs; it is deleted when the virtual machine exits.
	 */
	private static String writeUniqueUris( final String termFile, final BloomFilter filter, final int bufferSize ) throws IOException {
		final ProgressLogger pl = new ProgressLogger();
		pl.itemsName = "URIs";
		pl.start( "Copying URIs..." );

		final String uniqueTermFile;
		final FileInputStream inputStream = new FileInputStream( termFile );
		try {
			final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( inputStream, "UTF-8" ), bufferSize ), pl );
			final File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
			temp.deleteOnExit();
			uniqueTermFile = temp.toString();

			final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( uniqueTermFile ), bufferSize );
			try {
				while( termIterator.hasNext() ) {
					final MutableString uri = termIterator.next();
					makeUnique( filter, uri );
					uri.writeUTF8( outputStream );
					outputStream.write( '\n' );
				}
			}
			finally {
				outputStream.close();
			}
		}
		finally {
			inputStream.close();
		}
		pl.done();

		return uniqueTermFile;
	}

	/** Builds a serialised resolver from a list of URIs.
	 *
	 * <p>URIs are read, one per line and UTF-8 encoded, from standard input or from the file specified
	 * by the <code>--offline</code> option; the resulting resolver is stored in the file given as
	 * the only unflagged argument.
	 *
	 * @param arg the command-line arguments.
	 * @throws JSAPException if the command-line parser cannot be set up.
	 * @throws IOException if reading the URIs or storing the resolver fails.
	 */
	@SuppressWarnings("unchecked")
	public static void main( final String[] arg ) throws JSAPException, IOException {

		final SimpleJSAP jsap = new SimpleJSAP( URLMPHVirtualDocumentResolver.class.getName(), "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata.",
				new Parameter[] {
					new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b',  "buffer-size", "The size of the I/O buffer used to read terms." ),
					//new FlaggedOption( "class", MG4JClassParser.getParser(), ShiftAddXorSignedMinimalPerfectHash.class.getName(), JSAP.NOT_REQUIRED, 'c', "class", "A subclass of MinimalPerfectHash to be used when creating the table." ),
					new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ),
					new FlaggedOption( "uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter." ),
					new UnflaggedOption( "resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the resolver." )
		});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final int bufferSize = jsapResult.getInt( "bufferSize" );
		final String resolverName = jsapResult.getString( "resolver" );
		//final Class<?> tableClass = jsapResult.getClass( "class" );
		String termFile = jsapResult.getString( "termFile" );

		// The filter is non-null if and only if the user asked for unique URIs.
		BloomFilter filter = null;
		final boolean uniqueURIs = jsapResult.userSpecified( "uniqueUris" );
		if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) );

		final Collection<? extends CharSequence> collection;
		if ( termFile == null ) collection = readUrisFromStandardInput( filter, bufferSize );
		else {
			// When uniqueness is requested, work on a temporary copy containing unique URIs.
			if ( uniqueURIs ) termFile = writeUniqueUris( termFile, filter, bufferSize );
			collection = new FileLinesCollection( termFile, "UTF-8" );
		}

		LOGGER.debug( "Building minimal perfect hash table..." );
		BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new LcpMonotoneMinimalPerfectHashFunction<CharSequence>( collection, TransformationStrategies.prefixFreeUtf16() ) ) ), resolverName );
		LOGGER.debug( " done." );
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -