📄 pdfdocumentfactory.java

📁 MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections
💻 JAVA
字号:
package it.unimi.dsi.mg4j.document;/*		  * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna  * *  This library is free software; you can redistribute it and/or modify it *  under the terms of the GNU Lesser General Public License as published by the Free *  Software Foundation; either version 2.1 of the License, or (at your option) *  any later version. * *  This library is distributed in the hope that it will be useful, but *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License *  for more details. * *  You should have received a copy of the GNU Lesser General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.util.Properties;import java.io.IOException;import java.io.InputStream;import java.io.InterruptedIOException;import java.io.ObjectInputStream;import java.io.PipedReader;import java.io.PipedWriter;import org.apache.commons.configuration.ConfigurationException;import org.pdfbox.pdmodel.PDDocument;import org.pdfbox.util.PDFTextStripper;/** A factory that converts PDF (Portable Document Format) documents into text. * Presently this class is very inefficient; it is mainly useful for debugging * and exemplification purposes.  */public class PdfDocumentFactory extends PropertyBasedDocumentFactory {	private static final long serialVersionUID = 1L;	/** Case-insensitive keys for metadata. 	 * 	 *  @see PropertyBasedDocumentFactory.MetadataKeys	 */ 	public static enum MetadataKeys {		/** A property specifying that the factory should use the first line of text as a title (not implemented). */		PARSETITLE,	} 	/** A PDF text stripper that will be used to extract text from PDF documents. */	private transient PDFTextStripper textStripper;	/** The word reader used for all documents. */	private final WordReader wordReader;	protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {		if ( sameKey( MetadataKeys.PARSETITLE, key ) ) {			/*metadata.put( PARSE_TITLE, value );			return true;*/			throw new ConfigurationException( "PARSETITLE is not yet implemented" );		}				return super.parseProperty( key, values, metadata );	}	public PdfDocumentFactory() throws IOException {		this.textStripper= new PDFTextStripper();		this.wordReader = new FastBufferedReader();	}		public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {		super( properties );		this.textStripper= new PDFTextStripper();		this.wordReader = new FastBufferedReader();	}	public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {		super( defaultMetadata );		this.textStripper= new PDFTextStripper();		this.wordReader = new FastBufferedReader();	}	public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException {		super( property );		this.textStripper= new PDFTextStripper();		this.wordReader = new FastBufferedReader();	}	public PdfDocumentFactory copy() {		try {			return new PdfDocumentFactory( defaultMetadata );		}		catch ( IOException e ) {			throw new RuntimeException( e );		}	}		public int numberOfFields() {		return 1;	}		public String fieldName( final int field ) {		ensureFieldIndex( field );		return "text";	}		public int fieldIndex( final String fieldName ) {		return "text".equals( fieldName ) ? 0: -1;	}		public FieldType fieldType( final int field ) {		ensureFieldIndex( field );		return FieldType.TEXT;	}	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {		s.defaultReadObject();		textStripper = new PDFTextStripper();	}		public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {		return new AbstractDocument() {						private PDDocument pdfDocument;			private Thread pipingThread;			private PipedReader pipedReader;			private PipedWriter pipedWriter;						public CharSequence title() {				return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata );			}						public String toString() {				return title().toString();			}			public CharSequence uri() {				return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );			}			public Object content( final int field ) throws IOException {				ensureFieldIndex( field );				pipedReader = new PipedReader();				pipedWriter = new PipedWriter();				pdfDocument = PDDocument.load( rawContent );				pipedWriter.connect( pipedReader );				pipingThread = new Thread() {					public void run() {						try {							textStripper.writeText( pdfDocument, pipedWriter );							pipedWriter.close();							pipedWriter = null;						}						catch( InterruptedIOException dontCare ) {}						catch ( IOException e ) {							throw new RuntimeException( e );						}					}				};				pipingThread.start();				return pipedReader;			}						public WordReader wordReader( int field ) {				ensureFieldIndex( field );				// TODO: should depend on locale or something.				return wordReader;			}			public void close() throws IOException {				super.close();				if ( pipingThread != null ) {					try {						pipingThread.interrupt();						pipingThread.join();						pipingThread = null;					}					catch ( InterruptedException e ) {						throw new RuntimeException( e );					}				}				if ( pipedReader != null ) {					pipedReader.close();					pipedReader = null;				}				if ( pipedWriter != null ) {					pipedWriter.close();					pipedWriter = null;				}				if ( pdfDocument != null ) {					pdfDocument.close();					pdfDocument = null;				}			}		};	}}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -