📄 zipdocumentcollection.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Paolo Boldi * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.Util;import it.unimi.dsi.fastutil.objects.ObjectArrayList;import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.lang.MutableString;import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;import java.io.EOFException;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.io.ObjectInputStream;import java.io.Reader;import java.io.Serializable;import java.util.NoSuchElementException;import java.util.zip.ZipEntry;import java.util.zip.ZipFile;import java.util.zip.ZipInputStream;import org.apache.log4j.Logger;/** A {@link it.unimi.dsi.mg4j.document.DocumentCollection} produced from a document * sequence using {@link it.unimi.dsi.mg4j.document.ZipDocumentCollectionBuilder}. * * <p>The collection will produce the same documents as the original sequence whence it * was produced, in the following sense: * * <ul> * <li>the resulting collection has as many document as the original sequence, in the same order, with * the same titles and URI; * <li>every document has the same number of fields, with the same names and types; * <li>non-textual non-virtual fields will be written out as objects, so they need to be serializable; * <li>virtual fields will be written as a sequence of {@linkplain MutableString#writeSelfDelimUTF8(java.io.DataOutput) self-delimiting UTF-8 mutable strings} * starting with the number of fragments (converted into a string with {@link String#valueOf(int)}), * followed by a pair of strings for each fragment (the first string being the document specifier, * and the second being the associated text); * <li>textual fields will be written out in such a way that, when reading them, the same sequence * of words and non-words will be produced; alternatively, one may produce a collection that only * copies words (non-words are not copied). * </ul> * * <p><strong>Warning:</strong> the {@link java.io.Reader} returned by {@link it.unimi.dsi.mg4j.document.Document#content(int)} * for documents produced by this factory is just obtained as the concatenation of words and non-words returned by * the word reader for that field. * * <p>The collection will be, as any other collection, serialized on a file, but it will refer to another * zip file that is going to contain the documents themselves. */public class ZipDocumentCollection extends AbstractDocumentCollection implements Serializable { private static final long serialVersionUID = 1L; private static final Logger LOGGER = Util.getLogger( ZipDocumentCollection.class ); private static final boolean DEBUG = false; /** The name of the zip collection file. */ private final String zipFilename; /** The zip collection file. */ private transient ZipFile zipFile; /** The factory used for the original document sequence. */ private final DocumentFactory underlyingFactory; /** The factory used for this document collection. */ private transient DocumentFactory factory; /** The number of documents. */ private final int numberOfDocuments; /** <code>true</code> iff this is an exact reproduction of the original sequence (i.e., if also non-words are preserved). */ private final boolean exact; /** A factory tightly coupled to a {@link ZipDocumentCollection}. */ protected static class ZipFactory extends AbstractDocumentFactory { private static final long serialVersionUID = 1L; private final boolean exact; private final DocumentFactory underlyingFactory; protected ZipFactory( final boolean exact, final DocumentFactory underlyingFactory ) { this.exact = exact; this.underlyingFactory = underlyingFactory; } public ZipFactory copy() { return this; } public int numberOfFields() { return underlyingFactory.numberOfFields(); } public String fieldName( final int field ) { ensureFieldIndex( field ); return underlyingFactory.fieldName( field ); } public int fieldIndex( final String fieldName ) { return underlyingFactory.fieldIndex( fieldName ); } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); return underlyingFactory.fieldType( field ); } public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException { return new AbstractDocument() { int nextFieldToRead = 0; final MutableString uri = new MutableString(); { uri.readSelfDelimUTF8( rawContent ).compact(); } public CharSequence title() { return (CharSequence)metadata.get( MetadataKeys.TITLE ); } public String toString() { return title().toString(); } public CharSequence uri() { return uri; } /** Skips until the end of the current field, and increments <code>nextFieldToRead</code>. * @throws ClassNotFoundException * @throws IOException */ private void skipOneField() throws IOException, ClassNotFoundException { switch( fieldType( nextFieldToRead ) ) { case TEXT: MutableString word = new MutableString(); MutableString nonWord = new MutableString(); do { word.readSelfDelimUTF8( rawContent ); if ( exact ) nonWord.readSelfDelimUTF8( rawContent ); } while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) ); break; case VIRTUAL: MutableString dummy = new MutableString(); int nfrag = Integer.parseInt( dummy.readSelfDelimUTF8( rawContent ).toString() ); for ( int i = 0; i < 2 * nfrag; i++ ) dummy.readSelfDelimUTF8( rawContent ); break; default: // Non-text and non-virtual new ObjectInputStream( rawContent ).readObject(); } nextFieldToRead++; } /** Skips to the given field. * * @param field the field to skip to. * @throws IOException * @throws ClassNotFoundException */ private void skipToField( final int field ) throws IOException, ClassNotFoundException { if ( nextFieldToRead > field ) throw new IllegalStateException( "Trying to skip to field " + field + " after " + nextFieldToRead ); while ( nextFieldToRead < field ) skipOneField(); } public Object content( final int field ) { ensureFieldIndex( field ); Object result = null; if ( DEBUG ) LOGGER.debug( "Called content(" + field + "); nextField:" + nextFieldToRead ); try { skipToField( field ); if ( fieldType( nextFieldToRead ) == FieldType.VIRTUAL ) { int nfrag = Integer.parseInt( new MutableString().readSelfDelimUTF8( rawContent ).toString() ); MutableString doc = new MutableString(); MutableString text = new MutableString(); VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ]; for ( int i = 0; i < nfrag; i++ ) { doc.readSelfDelimUTF8( rawContent ); text.readSelfDelimUTF8( rawContent ); fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() ); } result = new ObjectArrayList<VirtualDocumentFragment>( fragArray ); } else if ( fieldType( nextFieldToRead ) != FieldType.TEXT ) { result = new ObjectInputStream( rawContent ).readObject(); if ( DEBUG ) LOGGER.debug( "Read " + result + " from field " + fieldName( nextFieldToRead ) + " of object " + title() ); nextFieldToRead++; } else { if ( DEBUG ) LOGGER.debug( "Returning reader for " + field ); result = new Reader() { FastBufferedReader fbr = null; int f = field; public void close() {} public int read( final char[] cbuf, final int off, final int len ) throws IOException { if ( fbr == null ) { if ( DEBUG ) LOGGER.debug( "Initialising reader for content " + f ); MutableString text = new MutableString(); MutableString word = new MutableString(); MutableString nonWord = new MutableString(); do { text.append( word.readSelfDelimUTF8( rawContent ) ); if ( exact ) text.append( nonWord.readSelfDelimUTF8( rawContent ) ); } while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) ); fbr = new FastBufferedReader( text ); nextFieldToRead++; } return fbr.read( cbuf, off, len ); } }; } } catch ( IOException e ) { throw new RuntimeException( e ); } catch (ClassNotFoundException e) { throw new RuntimeException( e ); } return result; } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); if ( DEBUG ) LOGGER.debug( "Called wordReader(" + field + ")" ); try { skipToField( field ); } catch ( Exception e ) { throw new RuntimeException( e ); } //logger.debug( "Asked for a new word reader for field " + fieldName( field ) ); switch ( fieldType( field ) ) { case TEXT: return new WordReader() { private static final long serialVersionUID = 1L; public boolean next( final MutableString word, final MutableString nonWord ) throws IOException { try { word.readSelfDelimUTF8( rawContent ); } catch( EOFException e ) { return false; // TODO: a bit raw } nonWord.length( 0 ); if ( exact ) { try { nonWord.readSelfDelimUTF8( rawContent ); } catch( EOFException e ) { return true; // TODO: a bit raw } } final boolean goOn = word.length() != 0 || ( exact && nonWord.length() != 0 ); if ( DEBUG ) LOGGER.debug( "Got word <" + word + "|" + nonWord + "> exact=" + exact + " returning " + goOn ); if ( ! goOn ) nextFieldToRead++; return goOn; } public WordReader setReader( final Reader reader ) { return this; } public WordReader copy() { throw new UnsupportedOperationException(); } }; case VIRTUAL: return new FastBufferedReader(); default: return null; } } }; } } /** Constructs a document collection (for reading) corresponding to a given zip collection file. * * @param zipFilename the filename of the zip collection. * @param underlyingFactory the underlying document factory. * @param numberOfDocuments the number of documents. * @param exact <code>true</code> iff this is an exact reproduction of the original sequence. * @throws IOException */ public ZipDocumentCollection( final String zipFilename, final DocumentFactory underlyingFactory, final int numberOfDocuments, final boolean exact ) throws IOException { this.zipFilename = zipFilename; this.underlyingFactory = underlyingFactory; this.numberOfDocuments = numberOfDocuments; this.exact = exact; zipFile = new ZipFile( new File( zipFilename ) ); // Creates the factory factory = new ZipFactory( exact, underlyingFactory ); } public ZipDocumentCollection copy() { try { return new ZipDocumentCollection( zipFilename, underlyingFactory, numberOfDocuments, exact ); } catch ( IOException e ) { throw new RuntimeException( e ); } } private Object readResolve() throws IOException { super.close(); return new ZipDocumentCollection( zipFilename, underlyingFactory, numberOfDocuments, exact ); } public DocumentFactory factory() { return factory; } public int size() { return numberOfDocuments; } private ZipEntry getEntry( final int index ) { ensureDocumentIndex( index ); final ZipEntry entry = zipFile.getEntry( Integer.toString( index ) ); if ( entry == null ) throw new NoSuchElementException( "Failure retrieving entry " + index ); return entry; } public Document document( final int index ) throws IOException { final ZipEntry entry = getEntry( index ); final Reference2ObjectMap<Enum<?>,Object> metadata = metadata( index, entry ); InputStream is = zipFile.getInputStream( entry ); return factory.getDocument( is, metadata ); } private Reference2ObjectMap<Enum<?>,Object> metadata( final int index, ZipEntry entry ) { if ( entry == null ) entry = getEntry( index ); final Reference2ObjectArrayMap<Enum<?>,Object> metadata = new Reference2ObjectArrayMap<Enum<?>,Object>( 1 ); metadata.put( MetadataKeys.TITLE, entry.getComment() ); return metadata; } public Reference2ObjectMap<Enum<?>,Object> metadata( final int index ) { return metadata( index, null ); } public InputStream stream( final int index ) throws IOException { final ZipEntry entry = zipFile.getEntry( Integer.toString( index ) ); entry.getComment(); // Just skip title InputStream is = zipFile.getInputStream( entry ); return is; } public DocumentIterator iterator() { try { return new AbstractDocumentIterator() { final Reference2ObjectArrayMap<Enum<?>,Object> metadata = new Reference2ObjectArrayMap<Enum<?>,Object>( new Enum[ 1 ], new Object[ 1 ] ); ZipInputStream zis = new ZipInputStream( new FileInputStream( zipFile.getName() ) ); public Document nextDocument() throws IOException { ZipEntry entry; String name; do { entry = zis.getNextEntry(); if ( entry == null ) return null; name = entry.getName(); } while ( !Character.isDigit( name.charAt( 0 ) ) ); if ( entry == null ) return null; String title = entry.getComment(); if ( DEBUG ) LOGGER.debug( "Reading sequentially document " + title + ", name: " + entry.getName() ); InputStream is = zipFile.getInputStream( entry ); metadata.put( MetadataKeys.TITLE, title ); return factory.getDocument( is, metadata ); } }; } catch ( FileNotFoundException e ) { throw new RuntimeException( e ); } } public void close() throws IOException { super.close(); zipFile.close(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -