📄 pdfparser.java

📁 非常有用的操作pdf文件的java源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/**
 * Copyright (c) 2003-2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.pdfparser;

import java.io.File;
import java.io.InputStream;
import java.io.IOException;

import java.util.Iterator;

import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.exceptions.WrappedIOException;
import org.pdfbox.io.RandomAccess;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.fdf.FDFDocument;

import org.pdfbox.persistence.util.COSObjectKey;

/**
 * This class will handle the parsing of the PDF document.
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.53 $
 */
public class PDFParser extends BaseParser
{
    private static final int SPACE_BYTE = 32;

    private static final String PDF_HEADER = "%PDF-";
    private COSDocument document;

    /**
     * Temp file directory.
     */
    private File tempDirectory = null;

    private RandomAccess raf = null;

    /**
     * Constructor.
     *
     * @param input The input stream that contains the PDF document.
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFParser( InputStream input ) throws IOException
    {
        this(input, null);
    }

    /**
     * Constructor to allow control over RandomAccessFile.
     * @param input The input stream that contains the PDF document.
     * @param rafi The RandomAccessFile to be used in internal COSDocument
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFParser(InputStream input, RandomAccess rafi)
        throws IOException
    {
        super(input);
        this.raf = rafi;
    }

    /**
     * This is the directory where pdfbox will create a temporary file
     * for storing pdf document stream in.  By default this directory will
     * be the value of the system property java.io.tmpdir.
     *
     * @param tmpDir The directory to create scratch files needed to store
     *        pdf document streams.
     */
    public void setTempDirectory( File tmpDir )
    {
        tempDirectory = tmpDir;
    }

    /**
     * This will prase the stream and create the PDF document.  This will close
     * the stream when it is done parsing.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    public void parse() throws IOException
    {
        try
        {
            if ( raf == null )
            {
                if( tempDirectory != null )
                {
                    document = new COSDocument( tempDirectory );
                }
                else
                {
                    document = new COSDocument();
                }
            }
            else
            {
                document = new COSDocument( raf );
            }
            setDocument( document );
            String header = readLine();
            document.setHeaderString( header );

            if( header.length() < PDF_HEADER.length()+1 )
            {
                throw new IOException( "Error: Header is corrupt '" + header + "'" );
            }

            //sometimes there are some garbage bytes in the header before the header
            //actually starts, so lets try to find the header first.
            int headerStart = header.indexOf( PDF_HEADER );

            //greater than zero because if it is zero then
            //there is no point of trimming
            if( headerStart > 0 )
            {
                //trim off any leading characters
                header = header.substring( headerStart, header.length() );
            }

            try
            {
                float pdfVersion = Float.parseFloat( 
                    header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER.length()+3) ) );
                document.setVersion( pdfVersion );
            }
            catch( NumberFormatException e )
            {
                throw new IOException( "Error getting pdf version:" + e );
            }

            skipHeaderFillBytes();


            Object nextObject;
            boolean wasLastParsedObjectAnXref = false;
            try
            {
                while( (nextObject = parseObject()) != null )
                {
                    if( nextObject instanceof PDFXref )
                    {
                        PDFXref xref = (PDFXref)nextObject;
                        addXref(xref);
                        wasLastParsedObjectAnXref = true;
                    }
                    else
                    {
                        wasLastParsedObjectAnXref = false;
                    }
                    skipSpaces();
                }
                if( document.getTrailer() == null )
                {
                    COSDictionary trailer = new COSDictionary();
                    Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
                    while( xrefIter.hasNext() )
                    {
                        COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
                        trailer.addAll( next );
                    }
                    document.setTrailer( trailer );
                }
                if( !document.isEncrypted() )
                {
                    document.dereferenceObjectStreams();
                }
            }
            catch( IOException e )
            {
                if( wasLastParsedObjectAnXref )
                {
                    //Then we assume that there is just random garbage after
                    //the xref, not sure why the PDF spec allows this but it does.
                }
                else
                {
                    //some other error so just pass it along
                    throw e;
                }
            }
        }
        catch( Throwable t )
        {
            //so if the PDF is corrupt then close the document and clear
            //all resources to it
            if( document != null )
            {
                document.close();
            }
            if( t instanceof IOException )
            {
                throw (IOException)t;
            }
            else
            {
                throw new WrappedIOException( t );
            }
        }
        finally
        {
            pdfSource.close();
        }
    }

    /**
     * This will skip a header's binary fill bytes.  This is in accordance to
     * PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
     *
     * @throws IOException If there is an error reading from the stream.
    */
    protected void skipHeaderFillBytes() throws IOException
    {
        skipSpaces();
        int c = pdfSource.peek();
        
        if( !Character.isDigit( (char)c ) )
        {
            // Fill bytes conform with PDF reference (but without comment sign)
            // => skip until EOL
            readLine();
        }
        // else: no fill bytes
    }

    /**
     * This will get the document that was parsed.  parse() must be called before this is called.
     * When you are done with this document you must call close() on it to release
     * resources.
     *
     * @return The document that was parsed.
     *
     * @throws IOException If there is an error getting the document.
     */
    public COSDocument getDocument() throws IOException
    {
        if( document == null )
        {
            throw new IOException( "You must call parse() before calling getDocument()" );
        }
        return document;
    }

    /**
     * This will get the PD document that was parsed.  When you are done with
     * this document you must call close() on it to release resources.
     *
     * @return The document at the PD layer.
     *
     * @throws IOException If there is an error getting the document.
     */
    public PDDocument getPDDocument() throws IOException
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -