📄 zipdocumentcollectionbuilder.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.io.BinIO;import it.unimi.dsi.fastutil.objects.ObjectList;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.logging.ProgressLogger;import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;import it.unimi.dsi.mg4j.tool.Scan;import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;import it.unimi.dsi.mg4j.util.MG4JClassParser;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.ObjectOutputStream;import java.io.Reader;import java.lang.reflect.InvocationTargetException;import java.util.zip.ZipEntry;import java.util.zip.ZipOutputStream;import org.apache.log4j.Logger;import com.martiansoftware.jsap.FlaggedOption;import com.martiansoftware.jsap.JSAP;import com.martiansoftware.jsap.JSAPException;import com.martiansoftware.jsap.JSAPResult;import com.martiansoftware.jsap.Parameter;import com.martiansoftware.jsap.SimpleJSAP;import com.martiansoftware.jsap.Switch;import com.martiansoftware.jsap.UnflaggedOption;/** A builder to create {@link ZipDocumentCollection}s. * * <p>After creating an instance of this class, it is possible to add incrementally * new documents. Each document must be started with {@link #startDocument(CharSequence, CharSequence)} * and ended with {@link #endDocument()}; inside each document, each non-text field must be written by passing * an object to {@link #nonTextField(Object)}, whereas each text field must be * started with {@link #startTextField()} and ended with {@link #endTextField()}: inbetween, a call * to {@link #add(MutableString, MutableString)} must be made for each word/nonword pair retrieved * from the original collection. At the end, {@link #close()} returns a {@link it.unimi.dsi.mg4j.document.ZipDocumentCollection} * that must be serialised. * * <p>Alternatively, you can just call {@link #build(DocumentSequence)} and all the above will * be handled for you. * * <p>Each Zip entry corresponds to a document: the title is recorded in the comment field, whereas the * URI is written with {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} * directly to the zipped output stream. When building an <em>exact</em> * {@linkplain it.unimi.dsi.mg4j.document.ZipDocumentCollection} * subsequent word/nonword pairs are written in the same way, and * delimited by two empty strings. If the collection is not exact, just words are written, * and delimited by an empty string. Non-text fields are written directly to the zipped output stream. */public class ZipDocumentCollectionBuilder { private static final Logger LOGGER = Util.getLogger( ZipDocumentCollectionBuilder.class ); private static final boolean DEBUG = false; /** The output stream of the zip file. */ private ZipOutputStream zipOut; /** The number of documents written so far. */ private int numberOfDocuments; /** True iff also non-words should be reproduced. */ private boolean exact; /** The progress logger. */ private final ProgressLogger progressLogger; /** The filename of the zip file. */ private final String zipFilename; /** The factory of the base document sequence. */ private final DocumentFactory factory; /** Whether a text field has started but not yet ended. */ private boolean inTextField; /** Creates a new zipped collection builder. * * @param zipFilename the filename of the zip file. * @param factory the factory of the base document sequence. * @param exact true iff also non-words should be preserved. * @param progressLogger a progress logger. */ public ZipDocumentCollectionBuilder( final String zipFilename, final DocumentFactory factory, final boolean exact, final ProgressLogger progressLogger ) throws FileNotFoundException { this.zipFilename = zipFilename; this.factory = factory; this.zipOut = new ZipOutputStream( new FileOutputStream( zipFilename ) ); this.exact = exact; this.progressLogger = progressLogger; this.inTextField = false; } /** Starts a document entry. * * @param title the document title (usually, the result of {@link Document#title()}). * @param uri the document uri (usually, the result of {@link Document#uri()}). */ public void startDocument( final CharSequence title, final CharSequence uri ) throws IOException { final ZipEntry currEntry = new ZipEntry( Integer.toString( numberOfDocuments ) ); currEntry.setComment( title.toString() ); zipOut.putNextEntry( currEntry ); new MutableString( uri ).writeSelfDelimUTF8( zipOut ); } /** Ends a document entry. */ public void endDocument() throws IOException { zipOut.closeEntry(); numberOfDocuments++; } /** Starts a new text field. */ public void startTextField() { inTextField = true; } /** Adds a non-text field. * * @param o the content of the non-text field. */ public void nonTextField( final Object o ) throws IOException { if ( DEBUG ) LOGGER.debug( "Going to write non-text field " + o + " of class " + o.getClass() + " for document #" + numberOfDocuments ); ObjectOutputStream oos = new ObjectOutputStream( zipOut ); oos.writeObject( o ); oos.flush(); } /** Adds a virtual field. * * @param fragments the virtual fragments to be added. * */ public void virtualField( final ObjectList<VirtualDocumentFragment> fragments ) throws IOException { if ( DEBUG ) LOGGER.debug( "Going to write virtual field " + fragments + " for document #" + numberOfDocuments ); new MutableString().append( String.valueOf( fragments.size() ) ).writeSelfDelimUTF8( zipOut ); for ( VirtualDocumentFragment fragment: fragments ) { fragment.documentSpecifier().writeSelfDelimUTF8( zipOut ); fragment.text().writeSelfDelimUTF8( zipOut ); } } //This method can only be called if {@link #inTextField} is <code>true</code>, otherwise it will throw an {@link IllegalStateException}. /** Ends a new text field. */ public void endTextField() throws IOException { // Writing a 0 is like writing an empty string. if ( ! inTextField ) throw new IllegalStateException(); inTextField = false; zipOut.write( 0 ); if ( exact ) zipOut.write( 0 ); } /** Adds a word and a nonword to the current text field, provided that a text field has {@linkplain #startTextField() started} but not yet {@linkplain #endTextField() ended}; * otherwise, doesn't do anything. * * <p>Usually, <code>word</code> e <code>nonWord</code> are just the result of a call * to {@link WordReader#next(MutableString, MutableString)}. * * @param word a word. * @param nonWord a nonword. * */ public void add( final MutableString word, final MutableString nonWord ) throws IOException { if ( ! inTextField ) return; if ( DEBUG ) LOGGER.debug( "Going to write pair <" + word + "|" + nonWord + ">" ); if ( exact || word.length() > 0 ) word.writeSelfDelimUTF8( zipOut ); if ( exact ) nonWord.writeSelfDelimUTF8( zipOut ); } /** Terminates the contruction of the zipped collection and returns it. */ public ZipDocumentCollection close() throws IOException { zipOut.close(); return new ZipDocumentCollection( zipFilename, factory, numberOfDocuments, exact ); } /** A utility method copying all documents of an input sequence to a zipped collection. */ @SuppressWarnings("unchecked") public ZipDocumentCollection build( final DocumentSequence inputSequence ) throws IOException { progressLogger.start( "Zipping collection..." ); numberOfDocuments = 0; final DocumentIterator docIt = inputSequence.iterator(); if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" ); final int numberOfFields = factory.numberOfFields(); WordReader wordReader; MutableString word = new MutableString(); MutableString nonWord = new MutableString(); for (;;) { progressLogger.update(); Document document = docIt.nextDocument(); if ( document == null ) break; startDocument( document.title(), document.uri() ); for ( int field = 0; field < numberOfFields; field++ ) { Object content = document.content( field ); if ( factory.fieldType( field ) == FieldType.TEXT ) { startTextField(); wordReader = document.wordReader( field ); wordReader.setReader( (Reader)content ); while ( wordReader.next( word, nonWord ) ) add( word, nonWord ); endTextField(); } else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList<VirtualDocumentFragment>)content ); else nonTextField( content ); } document.close(); endDocument(); } progressLogger.done(); docIt.close(); return close(); } public static void main( final String[] arg ) throws JSAPException, IOException, ClassNotFoundException, InvocationTargetException, NoSuchMethodException, IllegalAccessException, InstantiationException { SimpleJSAP jsap = new SimpleJSAP( ZipDocumentCollectionBuilder.class.getName(), "Produces a zip document collection from an existing document sequence.", new Parameter[] { new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ), new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ), new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ), new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( Scan.DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter." ), new Switch( "approximated", 'a', "approximated", "If specified, non-words will not be copied." ), new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ), new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output document collection." ), new UnflaggedOption( "zipfile", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output zip file." ), } ); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; DocumentSequence documentSequence = Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER ); final ProgressLogger progressLogger = new ProgressLogger( LOGGER, "documents" ); if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size(); ZipDocumentCollectionBuilder builder = new ZipDocumentCollectionBuilder( jsapResult.getString( "zipfile" ), documentSequence.factory(), !jsapResult.getBoolean( "approximated"), progressLogger ); BinIO.storeObject( builder.build( documentSequence ), jsapResult.getString( "collection" ) ); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -