📄 pdfparser.java

📁 非常有用的操作pdf文件的java源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
    {
        return new PDDocument( getDocument() );
    }

    /**
     * Wraps the document that was parsed in a new FDFDocument.  The caller
     * owns the returned document and must call close() on it when finished
     * so that its resources are released.
     *
     * @return A new FDFDocument built around the result of getDocument().
     *
     * @throws IOException If there is an error getting the document.
     */
    public FDFDocument getFDFDocument() throws IOException
    {
        return new FDFDocument(getDocument());
    }

    /**
     * This will parse the next top-level item from the stream.
     *
     * After skipping spaces and any stray tokens that start with 'e'
     * (repeated endobj keywords), the next character decides what is parsed:
     * <ul>
     *   <li>end of stream: nothing is parsed;</li>
     *   <li>'x', 't' or 's': the xref/trailer/startxref/%%EOF tail of a
     *       document section;</li>
     *   <li>anything else: an indirect object ("num gen obj ... endobj"),
     *       optionally with a stream, which is stored in the document's
     *       object pool.</li>
     * </ul>
     *
     * @return The pooled COSObject for an indirect object, the PDFXref
     *         returned by parseXrefSection() when the tail started with
     *         'x' or 't', or null (end of stream, or a tail that started
     *         with 's').
     *
     * @throws IOException If an IO error occurs or the expected keywords
     *         (obj, endobj, startxref, %%EOF) are not found.
     */
    private Object parseObject() throws IOException
    {
        Object object = null;
        skipSpaces();
        char peekedChar = (char)pdfSource.peek();
        while( peekedChar == 'e' )
        {
            //there are times when there are multiple endobj, so let's
            //just read them and move on.
            readString();
            skipSpaces();
            peekedChar = (char)pdfSource.peek();
        }
        if( pdfSource.isEOF() )
        {
            //end of file: we will return a null object and call it a day.
        }
        else if( peekedChar == 'x' ||
                 peekedChar == 't' ||
                 peekedChar == 's')
        {
            //'x' and 's' are the first letters of xref and startxref;
            //'t' is presumably a bare trailer, because
            //FDF documents do not always have the xref
            if( peekedChar == 'x' || peekedChar == 't' )
            {
                object = parseXrefSection();
            }

            //if peeked char is xref or startxref
            if( peekedChar == 'x' || peekedChar == 's')
            {
                skipSpaces();
                //consume any further xref sections; their results are not kept
                while( pdfSource.peek() == 'x' )
                {
                    parseXrefSection();
                }
                String startxref = readString();
                if( !startxref.equals( "startxref" ) )
                {
                    throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
                }
                skipSpaces();
                //read some integer that is in the stream but PDFBox doesn't use
                readInt();
            }

            //This must NOT be read with readString because readString strips
            //out comments and it would think that %% is a comment in front
            //of the EOF
            String eof = readExpectedString( "%%EOF" );
            if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
            {
                throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
                                       " next=" +readString() );
            }
            else if( !pdfSource.isEOF() )
            {
                //we might really be at the end of the file, there might just be some crap at the
                //end of the file.
                pdfSource.fillBuffer();
                if( pdfSource.available() < 1000 )
                {
                    //We need to determine if we are at the end of the file.
                    byte[] data = new byte[ 1000 ];

                    //look ahead at what is left, then push it back so that
                    //it can still be parsed if it turns out to be real content
                    int amountRead = pdfSource.read( data );
                    if( amountRead != -1 )
                    {
                        pdfSource.unread( data, 0, amountRead );
                    }
                    boolean atEndOfFile = true;//we assume yes unless we find another.
                    //A three byte match that starts at i touches i+2, so the
                    //last start index that has to be tested is amountRead-3
                    //(inclusive); the previous bound of i<amountRead-3 never
                    //tested it and missed an "EOF" at the very end of the data.
                    for( int i=0; i<amountRead-2 && atEndOfFile; i++ )
                    {
                        atEndOfFile = !(data[i] == 'E' &&
                                        data[i+1] == 'O' &&
                                        data[i+2] == 'F' );
                    }
                    if( atEndOfFile )
                    {
                        //no further EOF marker: treat the rest as garbage
                        while( pdfSource.read( data, 0, data.length ) != -1 )
                        {
                            //read until done.
                        }
                    }
                }
            }
        }
        else
        {
            int number = -1;
            int genNum = -1;
            String objectKey = null;
            boolean missingObjectNumber = false;
            try
            {
                char peeked = (char)pdfSource.peek();
                if( peeked == '<' )
                {
                    //the object starts directly with its content, there is
                    //no "num gen obj" header to read
                    missingObjectNumber = true;
                }
                else
                {
                    number = readInt();
                }
            }
            catch( IOException e )
            {
                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
                //statements after an object, of course this is nonsense
                //but because we want to support as many PDFs as possible
                //we will simply try again
                number = readInt();
            }
            if( !missingObjectNumber )
            {
                skipSpaces();
                genNum = readInt();

                objectKey = readString( 3 );
                if( !objectKey.equals( "obj" ) )
                {
                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
                }
            }
            else
            {
                //headerless objects are pooled under the key (-1, -1)
                number = -1;
                genNum = -1;
            }

            skipSpaces();
            COSBase pb = parseDirObject();
            String endObjectKey = readString();
            if( endObjectKey.equals( "stream" ) )
            {
                //push the keyword (and a leading space) back so that
                //parseCOSStream can read it itself
                pdfSource.unread( endObjectKey.getBytes() );
                pdfSource.unread( ' ' );
                if( pb instanceof COSDictionary )
                {
                    pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
                }
                else
                {
                    // this is not legal
                    // the combination of a dict and the stream/endstream forms a complete stream object
                    throw new IOException("stream not preceded by dictionary");
                }
                endObjectKey = readString();
            }
            COSObjectKey key = new COSObjectKey( number, genNum );
            COSObject pdfObject = document.getObjectFromPool( key );
            object = pdfObject;
            pdfObject.setObject(pb);

            if( !endObjectKey.equals( "endobj" ) )
            {
                if( !pdfSource.isEOF() )
                {
                    try
                    {
                        //It is possible that the endobj is missing, there
                        //are several PDFs out there that do that so skip it and move on.
                        //If the token is a number it is taken to be the start
                        //of the next object and is pushed back.
                        Float.parseFloat( endObjectKey );
                        pdfSource.unread( SPACE_BYTE );
                        pdfSource.unread( endObjectKey.getBytes() );
                    }
                    catch( NumberFormatException e )
                    {
                        //we will try again in case there was some garbage which
                        //some writers will leave behind.
                        String secondEndObjectKey = readString();
                        if( !secondEndObjectKey.equals( "endobj" ) )
                        {
                            if( isClosing() )
                            {
                                //found a case with 17506.pdf object 41 that was like this
                                //41 0 obj [/Pattern /DeviceGray] ] endobj
                                //notice the second array close, here we are reading it
                                //and ignoring and attempting to continue
                                pdfSource.read();
                            }
                            skipSpaces();
                            String thirdPossibleEndObj = readString();
                            if( !thirdPossibleEndObj.equals( "endobj" ) )
                            {
                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
                                    "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                            }
                        }
                    }
                }
            }
            skipSpaces();

        }
        return object;
    }


    /**
     * Parses one xref section: the xref table followed by its trailer.
     *
     * The trailer is handed to parseTrailer(); the two integers that
     * parseXrefTable() stored (start object number and count) are returned
     * wrapped in a PDFXref.
     *
     * @return a new PDFXref built from the two values stored by parseXrefTable()
     *
     * @throws IOException If an IO error occurs.
     */
    protected PDFXref parseXrefSection() throws IOException
    {
        final int[] startAndCount = new int[2];

        parseXrefTable( startAndCount );
        parseTrailer();

        final int start = startAndCount[0];
        final int count = startAndCount[1];
        return new PDFXref( start, count );
    }

    /**
     * Reads past the xref table in the stream.
     *
     * If the first line read is exactly "xref", the two integers that follow
     * it are stored in params[0] and params[1]; params is left untouched
     * otherwise.  Tokens are then read and discarded until the token
     * "trailer" has been read, the stream is at EOF, or the next character
     * satisfies isEndOfName().
     *
     * @param params Receives the start (index 0) and count (index 1) values
     *
     * @throws IOException If an IO error occurs.
     */
    protected void parseXrefTable(int[] params) throws IOException
    {
        String token = readLine();
        if( token.equals( "xref" ) )
        {
            params[0] = readInt();
            params[1] = readInt();
            token = readString();
        }
        skipSpaces();

        //skip past all the xref entries.
        while( true )
        {
            if( token.equals( "trailer" )
                || pdfSource.isEOF()
                || isEndOfName( (char)pdfSource.peek() ) )
            {
                break;
            }
            token = readString();
            skipSpaces();
        }
        skipSpaces();
    }

    /**
     * Parses a trailer dictionary from the stream and records it on the
     * document.
     *
     * If the document has no trailer yet, the parsed dictionary becomes the
     * trailer; otherwise its entries are added to the existing trailer via
     * addAll().
     *
     * @throws IOException If an IO error occurs.
     */
    private void parseTrailer() throws IOException
    {
        COSDictionary freshTrailer = parseCOSDictionary();
        COSDictionary existingTrailer = document.getTrailer();
        if( existingTrailer != null )
        {
            existingTrailer.addAll( freshTrailer );
            return;
        }
        document.setTrailer( freshTrailer );
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -