📄 pdfdocumentfactory.java
字号:
package it.unimi.dsi.mg4j.document;/* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2007 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;import it.unimi.dsi.io.FastBufferedReader;import it.unimi.dsi.io.WordReader;import it.unimi.dsi.util.Properties;import java.io.IOException;import java.io.InputStream;import java.io.InterruptedIOException;import java.io.ObjectInputStream;import java.io.PipedReader;import java.io.PipedWriter;import org.apache.commons.configuration.ConfigurationException;import org.pdfbox.pdmodel.PDDocument;import org.pdfbox.util.PDFTextStripper;/** A factory that converts PDF (Portable Document Format) documents into text. * Presently this class is very inefficient; it is mainly useful for debugging * and exemplification purposes. */public class PdfDocumentFactory extends PropertyBasedDocumentFactory { private static final long serialVersionUID = 1L; /** Case-insensitive keys for metadata. * * @see PropertyBasedDocumentFactory.MetadataKeys */ public static enum MetadataKeys { /** A property specifying that the factory should use the first line of text as a title (not implemented). */ PARSETITLE, } /** A PDF text stripper that will be used to extract text from PDF documents. */ private transient PDFTextStripper textStripper; /** The word reader used for all documents. */ private final WordReader wordReader; protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException { if ( sameKey( MetadataKeys.PARSETITLE, key ) ) { /*metadata.put( PARSE_TITLE, value ); return true;*/ throw new ConfigurationException( "PARSETITLE is not yet implemented" ); } return super.parseProperty( key, values, metadata ); } public PdfDocumentFactory() throws IOException { this.textStripper= new PDFTextStripper(); this.wordReader = new FastBufferedReader(); } public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException { super( properties ); this.textStripper= new PDFTextStripper(); this.wordReader = new FastBufferedReader(); } public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException { super( defaultMetadata ); this.textStripper= new PDFTextStripper(); this.wordReader = new FastBufferedReader(); } public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException { super( property ); this.textStripper= new PDFTextStripper(); this.wordReader = new FastBufferedReader(); } public PdfDocumentFactory copy() { try { return new PdfDocumentFactory( defaultMetadata ); } catch ( IOException e ) { throw new RuntimeException( e ); } } public int numberOfFields() { return 1; } public String fieldName( final int field ) { ensureFieldIndex( field ); return "text"; } public int fieldIndex( final String fieldName ) { return "text".equals( fieldName ) ? 0: -1; } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); return FieldType.TEXT; } private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); textStripper = new PDFTextStripper(); } public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) { return new AbstractDocument() { private PDDocument pdfDocument; private Thread pipingThread; private PipedReader pipedReader; private PipedWriter pipedWriter; public CharSequence title() { return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ); } public String toString() { return title().toString(); } public CharSequence uri() { return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata ); } public Object content( final int field ) throws IOException { ensureFieldIndex( field ); pipedReader = new PipedReader(); pipedWriter = new PipedWriter(); pdfDocument = PDDocument.load( rawContent ); pipedWriter.connect( pipedReader ); pipingThread = new Thread() { public void run() { try { textStripper.writeText( pdfDocument, pipedWriter ); pipedWriter.close(); pipedWriter = null; } catch( InterruptedIOException dontCare ) {} catch ( IOException e ) { throw new RuntimeException( e ); } } }; pipingThread.start(); return pipedReader; } public WordReader wordReader( int field ) { ensureFieldIndex( field ); // TODO: should depend on locale or something. return wordReader; } public void close() throws IOException { super.close(); if ( pipingThread != null ) { try { pipingThread.interrupt(); pipingThread.join(); pipingThread = null; } catch ( InterruptedException e ) { throw new RuntimeException( e ); } } if ( pipedReader != null ) { pipedReader.close(); pipedReader = null; } if ( pipedWriter != null ) { pipedWriter.close(); pipedWriter = null; } if ( pdfDocument != null ) { pdfDocument.close(); pdfDocument = null; } } }; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -