📄 baseparser.java

📁 非常有用的操作pdf文件的java源码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/**
 * Copyright (c) 2003-2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
package org.pdfbox.pdfparser;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;

import java.util.ArrayList;
import java.util.List;

import org.pdfbox.io.ByteArrayPushBackInputStream;
import org.pdfbox.io.PushBackInputStream;
import org.pdfbox.io.RandomAccess;

import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSBoolean;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSNull;
import org.pdfbox.cos.COSNumber;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.cos.COSString;

import org.pdfbox.persistence.util.COSObjectKey;

/**
 * This class is used to contain parsing logic that will be used by both the
 * PDFParser and the COSStreamParser.
 *
 * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
 * @version $Revision: 1.59 $
 */
public abstract class BaseParser
{
    /**
     * This is a byte array that will be used for comparisons.
     */
    public static final byte[] ENDSTREAM = 
        new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" );

    /**
     * This is a byte array that will be used for comparisons.
     */
    public static final String DEF = "def";

    /**
     * This is the stream that will be read from.
     */
    //protected PushBackByteArrayStream pdfSource;
    protected PushBackInputStream pdfSource;

    /**
     * moved xref here, is a persistence construct
     * maybe not needed anyway when not read from behind with delayed
     * access to objects.
     */
    private List xrefs = new ArrayList();

    private COSDocument document;

    /**
     * Constructor.
     *
     * @param input The input stream to read the data from.
     * 
     * @throws IOException If there is an error reading the input stream.
     */
    public BaseParser( InputStream input) throws IOException
    {
        //pdfSource = new PushBackByteArrayStream( input );
        pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 );
    }
    
    /**
     * Constructor.
     *
     * @param input The array to read the data from.
     * 
     * @throws IOException If there is an error reading the byte data.
     */
    protected BaseParser(byte[] input) throws IOException
    {
        pdfSource = new ByteArrayPushBackInputStream(input);
    }
    
    /**
     * Set the document for this stream.
     * 
     * @param doc The current document.
     */
    public void setDocument( COSDocument doc )
    {
        document = doc;
    }

    private static boolean isHexDigit(char ch)
    {
        return (ch >= '0' && ch <= '9') || 
        (ch >= 'a' && ch <= 'f') || 
        (ch >= 'A' && ch <= 'F');
        // the line below can lead to problems with certain versions of the IBM JIT compiler
        // (and is slower anyway)
        //return (HEXDIGITS.indexOf(ch) != -1);
    }

    /**
     * This will parse a PDF dictionary value.
     *
     * @return The parsed Dictionary object.
     *
     * @throws IOException If there is an error parsing the dictionary object.
     */
    private COSBase parseCOSDictionaryValue() throws IOException
    {
        COSBase retval = null;
        COSBase number = parseDirObject();
        skipSpaces();
        char next = (char)pdfSource.peek();
        if( next >= '0' && next <= '9' )
        {
            COSBase generationNumber = parseDirObject();
            skipSpaces();
            char r = (char)pdfSource.read();
            if( r != 'R' )
            {
                throw new IOException( "expected='R' actual='" + r + "' " + pdfSource );
            }
            COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
                                                ((COSInteger) generationNumber).intValue());
            retval = document.getObjectFromPool(key);
        }
        else
        {
            retval = number;
        }
        return retval;
    }

    /**
     * This will parse a PDF dictionary.
     *
     * @return The parsed dictionary.
     *
     * @throws IOException IF there is an error reading the stream.
     */
    protected COSDictionary parseCOSDictionary() throws IOException
    {
        char c = (char)pdfSource.read();
        if( c != '<')
        {
            throw new IOException( "expected='<' actual='" + c + "'" );
        }
        c = (char)pdfSource.read();
        if( c != '<')
        {
            throw new IOException( "expected='<' actual='" + c + "' " + pdfSource );
        }
        skipSpaces();
        COSDictionary obj = new COSDictionary();
        boolean done = false;
        while( !done )
        {
            skipSpaces();
            c = (char)pdfSource.peek();
            if( c == '>')
            {
                done = true;
            }
            else
            {
                COSName key = parseCOSName();
                COSBase value = parseCOSDictionaryValue();
                skipSpaces();
                if( ((char)pdfSource.peek()) == 'd' )
                {
                    //if the next string is 'def' then we are parsing a cmap stream
                    //and want to ignore it, otherwise throw an exception.
                    String potentialDEF = readString();
                    if( !potentialDEF.equals( DEF ) )
                    {
                        pdfSource.unread( potentialDEF.getBytes() );
                    }
                    else
                    {
                        skipSpaces();
                    }
                }

                if( value == null )
                {
                    throw new IOException("Bad Dictionary Declaration " + pdfSource );
                }
                obj.setItem( key, value );
            }
        }
        char ch = (char)pdfSource.read();
        if( ch != '>' )
        {
            throw new IOException( "expected='>' actual='" + ch + "'" );
        }
        ch = (char)pdfSource.read();
        if( ch != '>' )
        {
            throw new IOException( "expected='>' actual='" + ch + "'" );
        }
        return obj;
    }

    /**
     * This will read a COSStream from the input stream.
     *
     * @param file The file to write the stream to when reading.
     * @param dic The dictionary that goes with this stream.
     *
     * @return The parsed pdf stream.
     *
     * @throws IOException If there is an error reading the stream.
     */
    protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
    {
        COSStream stream = new COSStream( dic, file );
        OutputStream out = null;
        try
        {
            String streamString = readString();
            //long streamLength;

            if (!streamString.equals("stream"))
            {
                throw new IOException("expected='stream' actual='" + streamString + "'");
            }

            //PDF Ref 3.2.7 A stream must be followed by either
            //a CRLF or LF but nothing else.

            int whitespace = pdfSource.read();
            
            //see brother_scan_cover.pdf, it adds whitespaces
            //after the stream but before the start of the 
            //data, so just read those first
            while (whitespace == 0x20)
            {
                whitespace = pdfSource.read();
            }

            if( whitespace == 0x0D )
            {
                whitespace = pdfSource.read();
                if( whitespace != 0x0A )
                {
                    pdfSource.unread( whitespace );
                    //The spec says this is invalid but it happens in the real
                    //world so we must support it.
                    //throw new IOException("expected='0x0A' actual='0x" +
                    //    Integer.toHexString(whitespace) + "' " + pdfSource);
                }
            }
            else if (whitespace == 0x0A)
            {
                //that is fine
            }
            else
            {
                //we are in an error.
                //but again we will do a lenient parsing and just assume that everything
                //is fine
                pdfSource.unread( whitespace );
                //throw new IOException("expected='0x0D or 0x0A' actual='0x" +
                //Integer.toHexString(whitespace) + "' " + pdfSource);

            }


            COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH);
            /*long length = -1;
            if( streamLength instanceof COSNumber )
            {
                length = ((COSNumber)streamLength).intValue();
            }
            else if( streamLength instanceof COSObject &&
                     ((COSObject)streamLength).getObject() instanceof COSNumber )
            {
                length = ((COSNumber)((COSObject)streamLength).getObject()).intValue();
            }*/

            //length = -1;
            //streamLength = null;

            //Need to keep track of the
            out = stream.createFilteredStream( streamLength );
            String endStream = null;
            //the length is wrong in some pdf documents which means
            //that PDFBox must basically ignore it in order to be able to read
            //the most number of PDF documents.  This of course is a penalty hit,
            //maybe I could implement a faster parser.
            /**if( length != -1 )
            {
                byte[] buffer = new byte[1024];
                int amountRead = 0;
                int totalAmountRead = 0;
                while( amountRead != -1 && totalAmountRead < length )
                {
                    int maxAmountToRead = Math.min(buffer.length, (int)(length-totalAmountRead));
                    amountRead = pdfSource.read(buffer,0,maxAmountToRead);
                    totalAmountRead += amountRead;
                    if( amountRead != -1 )
                    {
                        out.write( buffer, 0, amountRead );
                    }
                }
            }
            else
            {**/
                readUntilEndStream( out );
            /**}*/
            skipSpaces();
            endStream = readString();

            if (!endStream.equals("endstream"))
            {
                readUntilEndStream( out );
                endStream = readString();
                if( !endStream.equals( "endstream" ) )
                {
                    throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
                }
            }
        }
        finally
        {
            if( out != null )
            {
                out.close();
            }
        }
        return stream;
    }

    private void readUntilEndStream( OutputStream out ) throws IOException
    {
        int currentIndex = 0;
        int byteRead = 0;
        //this is the additional bytes buffered but not written
        int additionalBytes=0;
        byte[] buffer = new byte[ENDSTREAM.length+additionalBytes];
        int writeIndex = 0;
        while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 )
        {
            writeIndex = currentIndex - buffer.length;
            if( writeIndex >= 0 )
            {
                out.write( buffer[writeIndex%buffer.length] );
            }
            byteRead = pdfSource.read();
            buffer[currentIndex%buffer.length] = (byte)byteRead;
            currentIndex++;
        }

        //we want to ignore the end of the line data when reading a stream
        //so will make an attempt to ignore it.
        /*writeIndex = currentIndex - buffer.length;
        if( buffer[writeIndex%buffer.length] == 13 &&
            buffer[(writeIndex+1)%buffer.length] == 10 )
        {
            //then ignore the newline before the endstream
        }
        else if( buffer[(writeIndex+1)%buffer.length] == 10 )
        {
            //Then first byte is data, second byte is newline
            out.write( buffer[writeIndex%buffer.length] );
        }
        else
        {
            out.write( buffer[writeIndex%buffer.length] );
            out.write( buffer[(writeIndex+1)%buffer.length] );
        }*/

        /**
         * Old way of handling newlines before endstream
        for( int i=0; i<additionalBytes; i++ )
        {
            writeIndex = currentIndex - buffer.length;
            if( writeIndex >=0 &&
                //buffer[writeIndex%buffer.length] != 10 &&
                buffer[writeIndex%buffer.length] != 13 )
            {
                out.write( buffer[writeIndex%buffer.length] );
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -