📄 zipdocumentcollection.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.document;

/*
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2007 Paolo Boldi
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.NoSuchElementException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

import org.apache.log4j.Logger;

/** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} produced from a document
 * sequence using {@link it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder}.
 *
 * <p>The collection will produce the same documents as the original sequence whence it
 * was produced, in the following sense:
 *
 *  <ul>
 *    <li>the resulting collection has as many documents as the original sequence, in the same order, with
 *     the same titles and URI;
 *    <li>every document has the same number of fields, with the same names and types;
 *    <li>non-textual non-virtual fields will be written out as objects, so they need to be serializable;
 *    <li>virtual fields will be written as a sequence of {@linkplain MutableString#writeSelfDelimUTF8(java.io.DataOutput) self-delimiting UTF-8 mutable strings}
 *     starting with the number of fragments (converted into a string with {@link String#valueOf(int)}),
 *     followed by a pair of strings for each fragment (the first string being the document specifier,
 *     and the second being the associated text);
 *    <li>textual fields will be written out in such a way that, when reading them, the same sequence
 *     of words and non-words will be produced; alternatively, one may produce a collection that only
 *     copies words (non-words are not copied).
 *  </ul>
 *
 * <p><strong>Warning:</strong> the {@link java.io.Reader} returned by {@link it.unimi.dsi.mg4j.document.Document#content(int)}
 * for documents produced by this factory is just obtained as the concatenation of words and non-words returned by
 * the word reader for that field.
 *
 * <p>The collection will be, as any other collection, serialized on a file, but it will refer to another
 * zip file that is going to contain the documents themselves.
 */
public class ZipDocumentCollection extends AbstractDocumentCollection implements Serializable {
	private static final long serialVersionUID = 1L;
	private static final Logger LOGGER = Util.getLogger( ZipDocumentCollection.class );
	private static final boolean DEBUG = false;

	/** The name of the zip collection file. */
	private final String zipFilename;
	/** The zip collection file. */
	private transient ZipFile zipFile;
	/** The factory used for the original document sequence. */
	private final DocumentFactory underlyingFactory;
	/** The factory used for this document collection. */
	private transient DocumentFactory factory;
	/** The number of documents. */
	private final int numberOfDocuments;
	/** <code>true</code> iff this is an exact reproduction of the original sequence (i.e., if also non-words are preserved). */
	private final boolean exact;

	/** A factory tightly coupled to a {@link ZipDocumentCollection}.
	 *
	 * <p>Field names, indices and types are delegated to the underlying factory; the documents
	 * returned by {@link #getDocument(InputStream, Reference2ObjectMap)} decode the fields
	 * sequentially from the raw stream of a zip entry.
	 */
	protected static class ZipFactory extends AbstractDocumentFactory {
		private static final long serialVersionUID = 1L;
		/** Whether non-words have been stored along with words. */
		private final boolean exact;
		/** The factory all field-related queries are delegated to. */
		private final DocumentFactory underlyingFactory;

		/** Creates a new factory.
		 *
		 * @param exact <code>true</code> iff non-words are stored in the zip entries, too.
		 * @param underlyingFactory the factory of the original document sequence.
		 */
		protected ZipFactory( final boolean exact, final DocumentFactory underlyingFactory ) {
			this.exact = exact;
			this.underlyingFactory = underlyingFactory;
		}

		/** Returns this factory itself.
		 *
		 * @return this factory.
		 */
		public ZipFactory copy() {
			return this;
		}

		public int numberOfFields() {
			return underlyingFactory.numberOfFields();
		}

		public String fieldName( final int field ) {
			ensureFieldIndex( field );
			return underlyingFactory.fieldName( field );
		}

		public int fieldIndex( final String fieldName ) {
			return underlyingFactory.fieldIndex( fieldName );
		}

		public FieldType fieldType( final int field ) {
			ensureFieldIndex( field );
			return underlyingFactory.fieldType( field );
		}

		/** Returns a document that lazily decodes its fields from the given stream.
		 *
		 * <p>The URI is read eagerly from the beginning of the stream; fields can then be accessed
		 * in increasing order only, as the stream is consumed sequentially.
		 *
		 * @param rawContent the stream of a zip entry.
		 * @param metadata the metadata; the title is looked up under {@link MetadataKeys#TITLE}.
		 * @return a document backed by <code>rawContent</code>.
		 * @throws IOException if the URI cannot be read.
		 */
		public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException {
			return new AbstractDocument() {
				/** The index of the first field whose data has not been completely passed over. */
				int nextFieldToRead = 0;
				/** Whether the data of field <code>nextFieldToRead</code> (a virtual field) has already been
				 * consumed by {@link #content(int)}, so that skipping it must not touch the stream. */
				boolean currentFieldConsumed = false;
				final MutableString uri = new MutableString();

				{
					uri.readSelfDelimUTF8( rawContent ).compact();
				}

				public CharSequence title() {
					return (CharSequence)metadata.get( MetadataKeys.TITLE );
				}

				public String toString() {
					return title().toString();
				}

				public CharSequence uri() {
					return uri;
				}

				/** Skips until the end of the current field, and increments <code>nextFieldToRead</code>.
				 *
				 * @throws ClassNotFoundException if a serialized field refers to an unknown class.
				 * @throws IOException if reading from the stream fails.
				 */
				private void skipOneField() throws IOException, ClassNotFoundException {
					if ( currentFieldConsumed ) {
						// content() already read this virtual field: re-parsing would desynchronise the stream.
						currentFieldConsumed = false;
						nextFieldToRead++;
						return;
					}

					switch( fieldType( nextFieldToRead ) ) {
					case TEXT:
						MutableString word = new MutableString();
						MutableString nonWord = new MutableString();
						// A text field ends with an empty word (and, if exact, an empty non-word).
						do {
							word.readSelfDelimUTF8( rawContent );
							if ( exact ) nonWord.readSelfDelimUTF8( rawContent );
						} while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) );
						break;
					case VIRTUAL:
						MutableString dummy = new MutableString();
						int nfrag = Integer.parseInt( dummy.readSelfDelimUTF8( rawContent ).toString() );
						// Each fragment is a (document specifier, text) pair of strings.
						for ( int i = 0; i < 2 * nfrag; i++ ) dummy.readSelfDelimUTF8( rawContent );
						break;
					default: // Non-text and non-virtual
						new ObjectInputStream( rawContent ).readObject();
					}
					nextFieldToRead++;
				}

				/** Skips to the given field.
				 *
				 * @param field the field to skip to.
				 * @throws IOException if reading from the stream fails.
				 * @throws ClassNotFoundException if a serialized field refers to an unknown class.
				 * @throws IllegalStateException if the requested field has already been passed over.
				 */
				private void skipToField( final int field ) throws IOException, ClassNotFoundException {
					if ( nextFieldToRead > field ) throw new IllegalStateException( "Trying to skip to field " + field + " after " + nextFieldToRead );
					while ( nextFieldToRead < field ) skipOneField();
				}

				/** Returns the content of the given field.
				 *
				 * <p>Virtual fields yield a list of fragments, text fields yield a {@link Reader}
				 * that decodes the field on its first read, and all other fields yield the deserialized object.
				 *
				 * @param field a field index.
				 * @return the content of the field.
				 */
				public Object content( final int field ) {
					ensureFieldIndex( field );
					Object result = null;
					if ( DEBUG ) LOGGER.debug( "Called content(" + field + "); nextField:" + nextFieldToRead );
					try {
						skipToField( field );
						if ( fieldType( nextFieldToRead ) == FieldType.VIRTUAL ) {
							int nfrag = Integer.parseInt( new MutableString().readSelfDelimUTF8( rawContent ).toString() );
							MutableString doc = new MutableString();
							MutableString text = new MutableString();
							VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ];
							for ( int i = 0; i < nfrag; i++ ) {
								doc.readSelfDelimUTF8( rawContent );
								text.readSelfDelimUTF8( rawContent );
								fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() );
							}
							result = new ObjectArrayList<VirtualDocumentFragment>( fragArray );
							// nextFieldToRead is left untouched so that wordReader( field ) can still be called
							// for this field; we just remember that its bytes are gone from the stream.
							currentFieldConsumed = true;
						}
						else if ( fieldType( nextFieldToRead ) != FieldType.TEXT ) {
							result = new ObjectInputStream( rawContent ).readObject();
							if ( DEBUG ) LOGGER.debug( "Read " + result + " from field " + fieldName( nextFieldToRead ) + " of object " + title() );
							nextFieldToRead++;
						}
						else {
							if ( DEBUG ) LOGGER.debug( "Returning reader for " + field );
							result = new Reader() {
								FastBufferedReader fbr = null;
								int f = field;

								public void close() {}

								public int read( final char[] cbuf, final int off, final int len ) throws IOException {
									if ( fbr == null ) {
										// First read: decode the whole field into memory.
										if ( DEBUG ) LOGGER.debug( "Initialising reader for content " + f );
										MutableString text = new MutableString();
										MutableString word = new MutableString();
										MutableString nonWord = new MutableString();
										do {
											text.append( word.readSelfDelimUTF8( rawContent ) );
											if ( exact ) text.append( nonWord.readSelfDelimUTF8( rawContent ) );
										} while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) );
										fbr = new FastBufferedReader( text );
										nextFieldToRead++;
									}
									return fbr.read( cbuf, off, len );
								}
							};
						}
					} catch ( IOException e ) {
						throw new RuntimeException( e );
					} catch ( ClassNotFoundException e ) {
						throw new RuntimeException( e );
					}
					return result;
				}

				/** Returns a word reader for the given field.
				 *
				 * <p>For text fields the returned reader decodes words (and non-words, if exact) directly
				 * from the stream, ignoring any reader set on it; for virtual fields a plain
				 * {@link FastBufferedReader} is returned; for all other fields the result is <code>null</code>.
				 *
				 * @param field a field index.
				 * @return a word reader for the field, or <code>null</code>.
				 */
				public WordReader wordReader( final int field )  {
					ensureFieldIndex( field );
					if ( DEBUG ) LOGGER.debug( "Called wordReader(" + field + ")" );
					try {
						skipToField( field );
					} catch ( Exception e ) {
						throw new RuntimeException( e );
					}

					switch ( fieldType( field ) ) {
					case TEXT:
						return new WordReader() {
							private static final long serialVersionUID = 1L;

							public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
								try {
									word.readSelfDelimUTF8( rawContent );
								}
								catch( EOFException e ) {
									return false; // TODO: a bit raw
								}
								nonWord.length( 0 );

								if ( exact ) {
									try {
										nonWord.readSelfDelimUTF8( rawContent );
									}
									catch( EOFException e ) {
										return true; // TODO: a bit raw
									}
								}
								// An empty word (and, if exact, an empty non-word) marks the end of the field.
								final boolean goOn = word.length() != 0 || ( exact && nonWord.length() != 0 );
								if ( DEBUG ) LOGGER.debug( "Got word <" + word + "|" + nonWord + "> exact=" + exact + " returning " + goOn );
								if ( ! goOn ) nextFieldToRead++;
								return goOn;
							}

							public WordReader setReader( final Reader reader ) {
								return this;
							}

							public WordReader copy() {
								throw new UnsupportedOperationException();
							}
						};
					case VIRTUAL:
						return new FastBufferedReader();
					default:
						return null;
					}
				}
			};
		}
	}

	/** Constructs a document collection (for reading) corresponding to a given zip collection file.
	 *
	 * @param zipFilename the filename of the zip collection.
	 * @param underlyingFactory the underlying document factory.
	 * @param numberOfDocuments the number of documents.
	 * @param exact <code>true</code> iff this is an exact reproduction of the original sequence.
	 * @throws IOException if the zip file cannot be opened.
	 */
	public ZipDocumentCollection( final String zipFilename, final DocumentFactory underlyingFactory, final int numberOfDocuments, final boolean exact ) throws IOException  {
		this.zipFilename = zipFilename;
		this.underlyingFactory = underlyingFactory;
		this.numberOfDocuments = numberOfDocuments;
		this.exact = exact;
		zipFile = new ZipFile( new File( zipFilename ) );
		// Creates the factory
		factory = new ZipFactory( exact, underlyingFactory );
	}

	/** Returns a new collection opened on the same zip file with the same parameters.
	 *
	 * @return a fresh collection; I/O errors are rethrown wrapped in a {@link RuntimeException}.
	 */
	public ZipDocumentCollection copy() {
		try {
			return new ZipDocumentCollection( zipFilename, underlyingFactory, numberOfDocuments, exact );
		}
		catch ( IOException e ) {
			throw new RuntimeException( e );
		}
	}

	/** Replaces a deserialized instance (whose transient zip file and factory are unset)
	 * with a freshly constructed collection.
	 */
	private Object readResolve() throws IOException {
		super.close();
		return new ZipDocumentCollection( zipFilename, underlyingFactory, numberOfDocuments, exact );
	}

	public DocumentFactory factory() {
		return factory;
	}

	public int size() {
		return numberOfDocuments;
	}

	/** Returns the zip entry of the document with given index.
	 *
	 * @param index a document index.
	 * @return the corresponding entry, whose name is the decimal representation of <code>index</code>.
	 * @throws NoSuchElementException if the zip file contains no such entry.
	 */
	private ZipEntry getEntry( final int index ) {
		ensureDocumentIndex( index );
		final ZipEntry entry = zipFile.getEntry( Integer.toString( index ) );
		if ( entry == null ) throw new NoSuchElementException( "Failure retrieving entry " + index );
		return entry;
	}

	public Document document( final int index ) throws IOException {
		final ZipEntry entry = getEntry( index );
		final Reference2ObjectMap<Enum<?>,Object> metadata = metadata( index, entry );
		InputStream is = zipFile.getInputStream( entry );
		return factory.getDocument( is, metadata );
	}

	/** Builds the metadata map (containing just the title, stored as entry comment) for a document.
	 *
	 * @param index a document index.
	 * @param entry the entry of the document, or <code>null</code> to have it looked up.
	 * @return the metadata of the document.
	 */
	private Reference2ObjectMap<Enum<?>,Object> metadata( final int index, ZipEntry entry ) {
		if ( entry == null ) entry = getEntry( index );
		final Reference2ObjectArrayMap<Enum<?>,Object> metadata = new Reference2ObjectArrayMap<Enum<?>,Object>( 1 );
		metadata.put( MetadataKeys.TITLE, entry.getComment() );
		return metadata;
	}

	public Reference2ObjectMap<Enum<?>,Object> metadata( final int index ) {
		return metadata( index, null );
	}

	/** Returns the raw stream of the zip entry of the document with given index.
	 *
	 * @param index a document index.
	 * @return the stream of the entry.
	 * @throws IOException if the entry cannot be opened.
	 * @throws NoSuchElementException if the zip file contains no such entry.
	 */
	public InputStream stream( final int index ) throws IOException {
		// Go through getEntry() so that bad indices and missing entries are reported as in document().
		return zipFile.getInputStream( getEntry( index ) );
	}

	/** Returns an iterator over the documents, in the order in which their entries appear in the zip file.
	 *
	 * <p>Entries whose name does not start with a digit are skipped. All documents returned
	 * share the same metadata map, which is updated at each call.
	 *
	 * @return a document iterator.
	 */
	public DocumentIterator iterator() {
		try {
			return new AbstractDocumentIterator() {
				final Reference2ObjectArrayMap<Enum<?>,Object> metadata = new Reference2ObjectArrayMap<Enum<?>,Object>( new Enum[ 1 ], new Object[ 1 ] );
				/** Used just to enumerate entries in file order; content is read through <code>zipFile</code>. */
				ZipInputStream zis = new ZipInputStream( new FileInputStream( zipFile.getName() ) );
				/** Whether the end of the zip file has been reached (and <code>zis</code> closed). */
				boolean exhausted = false;

				public Document nextDocument() throws IOException {
					if ( exhausted ) return null;

					ZipEntry entry;
					String name;
					do {
						entry = zis.getNextEntry();
						if ( entry == null ) {
							// No more entries: release the file descriptor right away.
							exhausted = true;
							zis.close();
							return null;
						}
						name = entry.getName();
					} while ( name.length() == 0 || !Character.isDigit( name.charAt( 0 ) ) );

					// Entry comments live in the central directory only, so entries returned by a
					// ZipInputStream carry no comment: fetch the title from the ZipFile entry instead.
					final ZipEntry indexedEntry = zipFile.getEntry( name );
					if ( indexedEntry != null ) entry = indexedEntry;

					final String title = entry.getComment();
					if ( DEBUG ) LOGGER.debug( "Reading sequentially document " + title + ", name: " + entry.getName() );
					InputStream is = zipFile.getInputStream( entry );
					metadata.put( MetadataKeys.TITLE, title );
					return factory.getDocument( is, metadata );
				}
			};
		} catch ( FileNotFoundException e ) {
			throw new RuntimeException( e );
		}
	}

	public void close() throws IOException {
		super.close();
		zipFile.close();
	}
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -