📄 baseparser.java

📁 非常有用的操作pdf文件的java源码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
            }
            currentIndex++;
        }
        */
        pdfSource.unread( ENDSTREAM );

    }

    /**
     * This basically checks to see if the next compareTo.length bytes of the
     * buffer match the compareTo byte array.
     */
    private boolean cmpCircularBuffer( byte[] buffer, int currentIndex, byte[] compareTo )
    {
        int cmpLen = compareTo.length;
        int buflen = buffer.length;
        boolean match = true;
        int off = currentIndex-cmpLen;
        if( off < 0 )
        {
            match = false;
        }
        for( int i=0; match && i<cmpLen; ++i )
        {
            match = buffer[(off+i)%buflen] == compareTo[i];
        }
        return match;
    }

    /**
     * This will parse a PDF string.
     *
     * @return The parsed PDF string.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected COSString parseCOSString() throws IOException
    {
        char nextChar = (char)pdfSource.read();
        COSString retval = new COSString();
        char openBrace;
        char closeBrace;
        if( nextChar == '(' )
        {
            openBrace = '(';
            closeBrace = ')';
        }
        else if( nextChar == '<' )
        {
            openBrace = '<';
            closeBrace = '>';
        }
        else
        {
            throw new IOException( "parseCOSString string should start with '(' or '<' and not '" +
                                   nextChar + "' " + pdfSource );
        }

        //This is the number of braces read
        //
        int braces = 1;
        int c = pdfSource.read();
        while( braces > 0 && c != -1)
        {
            char ch = (char)c;
            int nextc = -2; // not yet read
            //if( log.isDebugEnabled() )
            //{
            //    log.debug( "Parsing COSString character '" + c + "' code=" + (int)c );
            //}

            if(ch == closeBrace)
            {
                braces--;
                byte[] nextThreeBytes = new byte[3];
                int amountRead = pdfSource.read(nextThreeBytes);
                
                //lets handle the special case seen in Bull  River Rules and Regulations.pdf
                //The dictionary looks like this
                //    2 0 obj
                //    <<
                //        /Type /Info
                //        /Creator (PaperPort http://www.scansoft.com)
                //        /Producer (sspdflib 1.0 http://www.scansoft.com)
                //        /Title ( (5)
                //        /Author ()
                //        /Subject ()
                //
                // Notice the /Title, the braces are not even but they should
                // be.  So lets assume that if we encounter an this scenario
                //   <end_brace><new_line><opening_slash> then that
                // means that there is an error in the pdf and assume that
                // was the end of the document.
                if( amountRead == 3 )
                {
                    if( nextThreeBytes[0] == 0x0d &&
                        nextThreeBytes[1] == 0x0a &&
                        nextThreeBytes[2] == 0x2f )
                    {
                        braces = 0;
                    }
                }
                pdfSource.unread( nextThreeBytes, 0, amountRead );
                if( braces != 0 )
                {
                    retval.append( ch );
                }
            }
            else if( ch == openBrace )
            {
                braces++;
                retval.append( ch );
            }
            else if( ch == '\\' )
            {
                 //patched by ram
                char next = (char)pdfSource.read();
                switch(next)
                {
                    case 'n':
                        retval.append( '\n' );
                        break;
                    case 'r':
                        retval.append( '\r' );
                        break;
                    case 't':
                        retval.append( '\t' );
                        break;
                    case 'b':
                        retval.append( '\b' );
                        break;
                    case 'f':
                        retval.append( '\f' );
                        break;
                    case '(':
                    case ')':
                    case '\\':
                        retval.append( next );
                        break;
                    case 10:
                    case 13:
                        //this is a break in the line so ignore it and the newline and continue
                        c = pdfSource.read();
                        while( isEOL(c) && c != -1)
                        {
                            c = pdfSource.read();
                        }
                        nextc = c;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        StringBuffer octal = new StringBuffer();
                        octal.append( next );
                        c = pdfSource.read();
                        char digit = (char)c;
                        if( digit >= '0' && digit <= '7' )
                        {
                            octal.append( digit );
                            c = pdfSource.read();
                            digit = (char)c;
                            if( digit >= '0' && digit <= '7' )
                            {
                                octal.append( digit );
                            }
                            else 
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }   

                        int character = 0;
                        try
                        {
                            character = Integer.parseInt( octal.toString(), 8 );
                        }
                        catch( NumberFormatException e )
                        {
                            throw new IOException( "Error: Expected octal character, actual='" + octal + "'" );
                        }
                        retval.append( character );
                        break;
                    }
                    default:
                    {
                        retval.append( '\\' );
                        retval.append( next );
                        //another ficken problem with PDF's, sometimes the \ doesn't really
                        //mean escape like the PDF spec says it does, sometimes is should be literal
                        //which is what we will assume here.
                        //throw new IOException( "Unexpected break sequence '" + next + "' " + pdfSource );
                    }
                }
            }
            else
            {
                if( openBrace == '<' )
                {
                    if( isHexDigit(ch) )
                    {
                        retval.append( ch );
                    }
                }
                else
                {
                    retval.append( ch );
                }
            }
            if (nextc != -2)
            {
                c = nextc;
            }
            else 
            {
                c = pdfSource.read();
            }
        }
        if (c != -1)
        {
            pdfSource.unread(c);
        }
        if( openBrace == '<' )
        {
            retval = COSString.createFromHexString( retval.getString() );
        }
        return retval;
    }

    /**
     * This will parse a PDF array object.
     *
     * @return The parsed PDF array.
     *
     * @throws IOException If there is an error parsing the stream.
     */
    protected COSArray parseCOSArray() throws IOException
    {
        char ch = (char)pdfSource.read();
        if( ch != '[')
        {
            throw new IOException( "expected='[' actual='" + ch + "'" );
        }
        COSArray po = new COSArray();
        COSBase pbo = null;
        skipSpaces();
        int i = 0;
        while( ((i = pdfSource.peek()) > 0) && ((char)i != ']') )
        {
            pbo = parseDirObject();
            if( pbo instanceof COSObject )
            {
                COSInteger genNumber = (COSInteger)po.remove( po.size() -1 );
                COSInteger number = (COSInteger)po.remove( po.size() -1 );
                COSObjectKey key = new COSObjectKey(number.intValue(), genNumber.intValue());
                pbo = document.getObjectFromPool(key);
            }
            if( pbo != null )
            {
                po.add( pbo );
            }
            else
            {
                //it could be a bad object in the array which is just skipped
            }
            skipSpaces();
        }
        pdfSource.read(); //read ']'
        skipSpaces();
        return po;
    }

    /**
     * Determine if a character terminates a PDF name.
     *
     * @param ch The character
     * @return <code>true</code> if the character terminates a PDF name, otherwise <code>false</code>.
     */
    protected boolean isEndOfName(char ch)
    {
        return (ch == ' ' || ch == 13 || ch == 10 || ch == 9 || ch == '>' || ch == '<'
            || ch == '[' || ch =='/' || ch ==']' || ch ==')' || ch =='(' ||
            ch == -1 //EOF
            );
    }

    /**
     * This will parse a PDF name from the stream.
     *
     * @return The parsed PDF name.
     *
     * @throws IOException If there is an error reading from the stream.
     */
    protected COSName parseCOSName() throws IOException
    {
        COSName retval = null;
        int c = pdfSource.read();
        if( (char)c != '/')
        {
            throw new IOException("expected='/' actual='" + (char)c + "'-" + c + " " + pdfSource );
        }
        // costruisce il nome
        StringBuffer buffer = new StringBuffer();
        c = pdfSource.read();
        while( c != -1 )
        {
            char ch = (char)c;
            if(ch == '#')
            {
                char ch1 = (char)pdfSource.read();
                char ch2 = (char)pdfSource.read();

                // Prior to PDF v1.2, the # was not a special character.  Also,
                // it has been observed that various PDF tools do not follow the
                // spec with respect to the # escape, even though they report
                // PDF versions of 1.2 or later.  The solution here is that we
                // interpret the # as an escape only when it is followed by two
                // valid hex digits.
                //
                if (isHexDigit(ch1) && isHexDigit(ch2))
                {
                    String hex = "" + ch1 + ch2;
                    try
                    {
                        buffer.append( (char) Integer.parseInt(hex, 16));
                    }
                    catch (NumberFormatException e)
                    {
                        throw new IOException("Error: expected hex number, actual='" + hex + "'");
                    }
                    c = pdfSource.read();
                }
                else
                {
                    pdfSource.unread(ch2);
                    c = ch1;
                    buffer.append( ch );
                }
            }
            else if (isEndOfName(ch))
            {
                break;
            }
            else
            {
                buffer.append( ch );
                c = pdfSource.read();
            }
        }
        if (c != -1)
        {
            pdfSource.unread(c);
        }
        retval = COSName.getPDFName( buffer.toString() );
        return retval;
    }

    /**
     * This will parse a boolean object from the stream.
     *
     * @return The parsed boolean object.
     *
     * @throws IOException If an IO error occurs during parsing.
     */
    protected COSBoolean parseBoolean() throws IOException
    {
        COSBoolean retval = null;
        char c = (char)pdfSource.peek();
        if( c == 't' )
        {
            byte[] trueArray = new byte[ 4 ];
            int amountRead = pdfSource.read( trueArray, 0, 4 );
            String trueString = new String( trueArray, 0, amountRead );
            if( !trueString.equals( "true" ) )
            {
                throw new IOException( "Error parsing boolean: expected='true' actual='" + trueString + "'" );
            }
            else
            {
                retval = COSBoolean.TRUE;
            }
        }
        else if( c == 'f' )
        {
            byte[] falseArray = new byte[ 5 ];
            int amountRead = pdfSource.read( falseArray, 0, 5 );
            String falseString = new String( falseArray, 0, amountRead );
            if( !falseString.equals( "false" ) )
            {
                throw new IOException( "Error parsing boolean: expected='true' actual='" + falseString + "'" );
            }
            else
            {
                retval = COSBoolean.FALSE;
            }
        }
        else
        {
            throw new IOException( "Error parsing boolean expected='t or f' actual='" + c + "'" );
        }
        return retval;
    }

    /**
     * This will parse a directory object from the stream.
     *
     * @return The parsed object.
     *
     * @throws IOException If there is an error during parsing.
     */
    protected COSBase parseDirObject() throws IOException
    {
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -