📄 zipdocumentcollectionbuilder.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.ObjectList;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;import it.unimi.dsi.mg4j.tool.Scan;import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;import it.unimi.dsi.mg4j.util.MG4JClassParser;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.ObjectOutputStream;import java.io.Reader;import java.lang.reflect.InvocationTargetException;import java.util.zip.ZipEntry;import java.util.zip.ZipOutputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A builder to create {@link ZipDocumentCollection}s. *  * <p>After creating an instance of this class, it is possible to add incrementally * new documents. Each document must be started with {@link #startDocument(CharSequence, CharSequence)} * and ended with {@link #endDocument()}; inside each document, each non-text field must be written by passing * an object to {@link #nonTextField(Object)}, whereas each text field must be * started with {@link #startTextField()} and ended with {@link #endTextField()}: inbetween, a call * to {@link #add(MutableString, MutableString)} must be made for each word/nonword pair retrieved * from the original collection. At the end, {@link #close()} returns a {@link it.unimi.dsi.mg4j.document.ZipDocumentCollection} * that must be serialised. *  * <p>Alternatively, you can just call {@link #build(DocumentSequence)} and all the above will * be handled for you. *  * <p>Each Zip entry corresponds to a document: the title is recorded in the comment field, whereas the  * URI is written with {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} * directly to the zipped output stream. When building an <em>exact</em> * {@linkplain it.unimi.dsi.mg4j.document.ZipDocumentCollection}  * subsequent word/nonword pairs are written in the same way, and * delimited by two empty strings. If the collection is not exact, just words are written, * and delimited by an empty string. Non-text fields are written directly to the zipped output stream. */public class ZipDocumentCollectionBuilder {	private static final Logger LOGGER = Util.getLogger( ZipDocumentCollectionBuilder.class );	private static final boolean DEBUG = false;		/** The output stream of the zip file. */	private ZipOutputStream zipOut;	/** The number of documents written so far. */	private int numberOfDocuments;	/** True iff also non-words should be reproduced. */	private boolean exact;	/** The progress logger. */	private final ProgressLogger progressLogger;	/** The filename of the zip file. */	private final String zipFilename;	/** The factory of the base document sequence. */	private final DocumentFactory factory;	/** Whether a text field has started but not yet ended. */	private boolean inTextField;		/** Creates a new zipped collection builder.	 * 	 * @param zipFilename the filename of the zip file.	 * @param factory the factory of the base document sequence.	 * @param exact true iff also non-words should be preserved.	 * @param progressLogger a progress logger.	 */	public ZipDocumentCollectionBuilder( final String zipFilename, final DocumentFactory factory, final boolean exact, final ProgressLogger progressLogger ) throws FileNotFoundException {		this.zipFilename = zipFilename;		this.factory = factory;		this.zipOut = new ZipOutputStream( new FileOutputStream( zipFilename ) );		this.exact = exact;		this.progressLogger = progressLogger;		this.inTextField = false;	}		/** Starts a document entry.	 * 	 * @param title the document title (usually, the result of {@link Document#title()}).	 * @param uri the document uri (usually, the result of {@link Document#uri()}).	 */		public void startDocument( final CharSequence title, final CharSequence uri ) throws IOException {		final ZipEntry currEntry = new ZipEntry( Integer.toString( numberOfDocuments ) );		currEntry.setComment( title.toString() );		zipOut.putNextEntry( currEntry );		new MutableString( uri ).writeSelfDelimUTF8( zipOut );	}	/** Ends a document entry. 	 */		public void endDocument() throws IOException {		zipOut.closeEntry();		numberOfDocuments++;	}	/** Starts a new text field.	 */		public void startTextField() {		inTextField = true;	}	/** Adds a non-text field.	 * 	 * @param o the content of the non-text field.	 */	public void nonTextField( final Object o ) throws IOException {		if ( DEBUG ) LOGGER.debug( "Going to write non-text field " + o + " of class " + o.getClass() + " for document #" + numberOfDocuments );		ObjectOutputStream oos = new ObjectOutputStream( zipOut );		oos.writeObject( o );		oos.flush();	}		/** Adds a virtual field.	 * 	 *  @param fragments the virtual fragments to be added.	 * 	 */	public void virtualField( final ObjectList<VirtualDocumentFragment> fragments ) throws IOException {		if ( DEBUG ) LOGGER.debug( "Going to write virtual field " + fragments + " for document #" + numberOfDocuments );		new MutableString().append( String.valueOf( fragments.size() ) ).writeSelfDelimUTF8( zipOut );		for ( VirtualDocumentFragment fragment: fragments ) {			fragment.documentSpecifier().writeSelfDelimUTF8( zipOut );			fragment.text().writeSelfDelimUTF8( zipOut );		}	}		//This method can only be called if {@link #inTextField} is <code>true</code>, otherwise it will throw an {@link IllegalStateException}.	/** Ends a new text field. */	public void endTextField() throws IOException {		// Writing a 0 is like writing an empty string.		if ( ! inTextField ) throw new IllegalStateException();		inTextField = false;		zipOut.write( 0 );		if ( exact ) zipOut.write( 0 );	}	/** Adds a word and a nonword to the current text field, provided that a text field has {@linkplain #startTextField() started} but not yet {@linkplain #endTextField() ended};	 *  otherwise, doesn't do anything.	 *	 * <p>Usually, <code>word</code> e <code>nonWord</code> are just the result of a call	 * to {@link WordReader#next(MutableString, MutableString)}.	 *  	 * @param word a word.	 * @param nonWord a nonword.	 * */		public void add( final MutableString word, final MutableString nonWord ) throws IOException {		if ( ! inTextField ) return;		if ( DEBUG ) LOGGER.debug( "Going to write pair <" + word + "|" + nonWord + ">" );		if ( exact || word.length() > 0 ) word.writeSelfDelimUTF8( zipOut );		if ( exact ) nonWord.writeSelfDelimUTF8( zipOut );	}		/** Terminates the contruction of the zipped collection and returns it. */		public ZipDocumentCollection close() throws IOException {		zipOut.close();		return new ZipDocumentCollection( zipFilename, factory, numberOfDocuments, exact );	}		/** A utility method copying all documents of an input sequence to a zipped collection. */	@SuppressWarnings("unchecked")	public ZipDocumentCollection build( final DocumentSequence inputSequence ) throws IOException {		progressLogger.start( "Zipping collection..." );		numberOfDocuments = 0;		final DocumentIterator docIt = inputSequence.iterator();		if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );		final int numberOfFields = factory.numberOfFields();		WordReader wordReader;		MutableString word = new MutableString();		MutableString nonWord = new MutableString();				for (;;) {			progressLogger.update();			Document document = docIt.nextDocument();			if ( document == null ) break;			startDocument( document.title(), document.uri() );						for ( int field = 0; field < numberOfFields; field++ ) {				Object content = document.content( field );				if ( factory.fieldType( field ) == FieldType.TEXT ) {					startTextField();					wordReader = document.wordReader( field );					wordReader.setReader( (Reader)content );					while ( wordReader.next( word, nonWord ) ) add( word, nonWord );					endTextField();				}				else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList<VirtualDocumentFragment>)content );				else nonTextField( content );			}			document.close();			endDocument();		}		progressLogger.done();		docIt.close();		return close();	}			public static void main( final String[] arg ) throws JSAPException, IOException, ClassNotFoundException, InvocationTargetException, NoSuchMethodException, IllegalAccessException, InstantiationException {		SimpleJSAP jsap = new SimpleJSAP( ZipDocumentCollectionBuilder.class.getName(), "Produces a zip document collection from an existing document sequence.",				new Parameter[] {					new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ),					new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),					new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),					new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( Scan.DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter." ),					new Switch( "approximated", 'a', "approximated", "If specified, non-words will not be copied." ),					new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),					new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output document collection." ),					new UnflaggedOption( "zipfile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output zip file." ),				}		);		JSAPResult jsapResult = jsap.parse( arg );		if ( jsap.messagePrinted() ) return;		DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );		final ProgressLogger progressLogger = new ProgressLogger( LOGGER, "documents" );		if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();		ZipDocumentCollectionBuilder builder = new ZipDocumentCollectionBuilder( jsapResult.getString( "zipfile" ), documentSequence.factory(), !jsapResult.getBoolean( "approximated"), progressLogger );		BinIO.storeObject( builder.build( documentSequence ), jsapResult.getString( "collection" ) );	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -