📄 urlmphvirtualdocumentresolver.java
字号:
package it.unimi.dsi.mg4j.tool;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2006-2007 Paolo Boldi * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.bits.TransformationStrategies;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.FileLinesCollection;import it.unimi.dsi.io.LineIterator;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.Document;import it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction;import it.unimi.dsi.util.BloomFilter;import it.unimi.dsi.util.ShiftAddXorSignedStringMap;import it.unimi.dsi.util.StringMap;import java.io.File;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;import java.net.URI;import java.net.URISyntaxException;import java.util.ArrayList;import java.util.Collection;import org.apache.commons.lang.RandomStringUtils;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import 
com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** A virtual-document resolver based on document URIs.
 *
 * <p>Instances of this class store in a {@link StringMap}
 * all URIs from a collection, and consider a virtual-document specification a (possibly relative) URI. The
 * virtual-document specification is resolved against the document URI, and then the perfect hash is used
 * to retrieve the corresponding document.
 *
 * <p>This class provides a main method that helps in building serialised resolvers from URI lists.
 * In case of pathological document collections with duplicate URIs (most notably, the GOV2 collection
 * used for TREC evaluations), an option makes it possible to add random noise to duplicates, so that
 * minimal perfect hash construction does not go into an infinite loop. It is a rather crude solution, but it
 * is nonsensical to have duplicate URIs in the first place.
 */
public class URLMPHVirtualDocumentResolver implements VirtualDocumentResolver {
    private static final long serialVersionUID = 1L;
    private static final Logger LOGGER = Logger.getLogger( URLMPHVirtualDocumentResolver.class );

    /** The term map used by this resolver to associate URI strings to numbers. */
    private final StringMap<? extends CharSequence> url2DocumentPointer;
    /** The cached URI of the last argument to {@link #context(Document)}. */
    private transient URI documentURI;

    /** Creates a new resolver backed by the given map.
     *
     * @param url2DocumentPointer a map from URI strings to document pointers.
     */
    public URLMPHVirtualDocumentResolver( final StringMap<? extends CharSequence> url2DocumentPointer ) {
        this.url2DocumentPointer = url2DocumentPointer;
    }

    /** Sets the document against which relative specifications will be resolved.
     *
     * <p>The URI of the document is parsed and normalised; if it is not
     * syntactically valid, no context URI is kept and relative specifications
     * will not be resolvable until the next successful call.
     *
     * @param document the document that will be used as context.
     */
    public void context( final Document document ) {
        try {
            documentURI = new URI( document.uri().toString() ).normalize();
        }
        catch ( URISyntaxException e ) {
            documentURI = null;
        }
    }

    /** Resolves a virtual-document specification, interpreted as a (possibly relative) URI.
     *
     * @param virtualDocumentSpec the specification to be resolved.
     * @return the value associated by the underlying map to the normalised (and, if
     * relative, resolved against the current context) URI, narrowed to an integer; -1 if the
     * specification is relative and there is no context URI, or if any exception is thrown
     * while parsing or looking up the URI.
     */
    public int resolve( final CharSequence virtualDocumentSpec ) {
        try {
            URI virtualURI = URI.create( virtualDocumentSpec.toString() ).normalize();
            if ( ! virtualURI.isAbsolute() ) {
                if ( documentURI == null ) return -1;
                virtualURI = documentURI.resolve( virtualURI );
            }
            // TODO discard opaque?
            return (int)url2DocumentPointer.getLong( virtualURI.toString() );
        }
        catch ( Exception e ) {
            // Deliberately best-effort: an unparsable specification is simply unresolvable.
            return -1;
        }
    }

    /** Returns the number of documents known to this resolver, that is, the size of the underlying map.
     *
     * @return the size of the underlying map.
     */
    public int numberOfDocuments() {
        return url2DocumentPointer.size();
    }

    /** Modifies a URI in place until the filter accepts it as new.
     *
     * <p>As long as {@code filter.add( uri )} returns false the URI is treated as a duplicate,
     * and a slash followed by 32 random alphanumeric characters is appended to it.
     *
     * @param filter the filter recording the URIs seen so far.
     * @param uri the URI to be made unique; it may be modified.
     */
    private static void makeUnique( final BloomFilter filter, final MutableString uri ) {
        while( ! filter.add( uri ) ) {
            LOGGER.debug( "Duplicate URI " + uri );
            uri.append( '/' ).append( RandomStringUtils.randomAlphanumeric( 32 ) );
        }
    }

    /** Reads all URIs from standard input (as UTF-8 lines) into core memory.
     *
     * @param filter the filter used to make URIs unique, or {@code null} if duplicates must be left alone.
     * @param bufferSize the size of the I/O buffer.
     * @return the list of URIs read, in input order.
     */
    private static Collection<MutableString> readUrisFromStandardInput( final BloomFilter filter, final int bufferSize ) throws IOException {
        final ArrayList<MutableString> termList = new ArrayList<MutableString>();
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        // Standard input is not owned by this method, so the reader wrapping it is not closed.
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
        pl.start( "Reading URIs..." );
        while( termIterator.hasNext() ) {
            final MutableString uri = termIterator.next();
            if ( filter != null ) makeUnique( filter, uri );
            // A copy is stored; presumably the iterator reuses the returned instance (confirm against LineIterator).
            termList.add( uri.copy() );
        }
        pl.done();
        return termList;
    }

    /** Copies the URIs contained in a file to a temporary file, making them unique on the way.
     *
     * <p>The input is decoded as UTF-8, which is the same encoding used to write the copy and
     * to read it back later. Both streams are closed even if the copy fails. The temporary file
     * is registered for deletion on exit.
     *
     * @param termFile the name of the file containing the URIs, one per line.
     * @param filter the filter used to make URIs unique.
     * @param bufferSize the size of the I/O buffers.
     * @return the name of the temporary file containing the unique URIs.
     */
    private static String copyUniqueUris( final String termFile, final BloomFilter filter, final int bufferSize ) throws IOException {
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final FastBufferedReader reader = new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ), "UTF-8" ), bufferSize );
        try {
            final LineIterator termIterator = new LineIterator( reader, pl );
            final File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
            temp.deleteOnExit();
            final String tempName = temp.toString();
            final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( tempName ), bufferSize );
            try {
                while( termIterator.hasNext() ) {
                    final MutableString uri = termIterator.next();
                    makeUnique( filter, uri );
                    uri.writeUTF8( outputStream );
                    outputStream.write( '\n' );
                }
            }
            finally {
                outputStream.close();
            }
            pl.done();
            return tempName;
        }
        finally {
            reader.close();
        }
    }

    /** Builds a serialised resolver from a list of URIs.
     *
     * <p>URIs are read from standard input, or from a file if the <samp>--offline</samp> option
     * is specified. If <samp>--unique-uris</samp> is specified, duplicates are altered by
     * {@link #makeUnique(BloomFilter, MutableString)} before the map is built. The resulting
     * resolver is stored under the given filename.
     *
     * @param arg the command-line arguments.
     */
    @SuppressWarnings("unchecked")
    public static void main( final String[] arg ) throws JSAPException, IOException {
        final SimpleJSAP jsap = new SimpleJSAP( URLMPHVirtualDocumentResolver.class.getName(), "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata.",
            new Parameter[] {
                new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read terms." ),
                new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ),
                new FlaggedOption( "uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter." ),
                new UnflaggedOption( "resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the resolver." )
            }
        );

        final JSAPResult jsapResult = jsap.parse( arg );
        if ( jsap.messagePrinted() ) return;

        final int bufferSize = jsapResult.getInt( "bufferSize" );
        final String resolverName = jsapResult.getString( "resolver" );
        final String termFile = jsapResult.getString( "termFile" );
        // The filter exists only if the user asked for unique URIs.
        final BloomFilter filter = jsapResult.userSpecified( "uniqueUris" ) ? new BloomFilter( jsapResult.getInt( "uniqueUris" ) ) : null;

        final Collection<? extends CharSequence> collection;
        if ( termFile == null ) collection = readUrisFromStandardInput( filter, bufferSize );
        else collection = new FileLinesCollection( filter != null ? copyUniqueUris( termFile, filter, bufferSize ) : termFile, "UTF-8" );

        LOGGER.debug( "Building minimal perfect hash table..." );
        BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new LcpMonotoneMinimalPerfectHashFunction<CharSequence>( collection, TransformationStrategies.prefixFreeUtf16() ) ) ), resolverName );
        LOGGER.debug( " done." );
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -