📄 pdftextstripper.java

📁 非常有用的操作pdf文件的java源码
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
                numberOfArticleSections = 1;
            }
            int originalSize = charactersByArticle.size();
            charactersByArticle.setSize( numberOfArticleSections );
            for( int i=0; i<numberOfArticleSections; i++ )
            {
                if( numberOfArticleSections < originalSize )
                {
                    ((List)charactersByArticle.get( i )).clear();
                }
                else
                {
                    charactersByArticle.set( i, new ArrayList() );
                }
            }
            
            characterListMapping.clear();
            processStream( page, page.findResources(), content );
            flushText();
            endPage( page );
        }
        
    }
    
    /**
     * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
     * may provide additional information.
     * 
     * @throws IOException If there is any error writing to the stream.
     */
    protected void startParagraph() throws IOException
    {
        //default is to do nothing.
    }
    
    /**
     * End a paragraph.  Default implementation is to do nothing.  Subclasses
     * may provide additional information.
     * 
     * @throws IOException If there is any error writing to the stream.
     */
    protected void endParagraph() throws IOException
    {
        //default is to do nothing
    }
    
    /**
     * Start a new page.  Default implementation is to do nothing.  Subclasses
     * may provide additional information.
     * 
     * @param page The page we are about to process.
     * 
     * @throws IOException If there is any error writing to the stream.
     */
    protected void startPage( PDPage page ) throws IOException
    {
        //default is to do nothing.
    }
    
    /**
     * End a page.  Default implementation is to do nothing.  Subclasses
     * may provide additional information.
     * 
     * @param page The page we are about to process.
     * 
     * @throws IOException If there is any error writing to the stream.
     */
    protected void endPage( PDPage page ) throws IOException
    {
        //default is to do nothing
    }

    /**
     * This will print the text to the output stream.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void flushText() throws IOException
    {
        float currentY = -1;
        float lastBaselineFontSize = -1;
        float endOfLastTextX = -1;
        float startOfNextWordX = -1;
        float lastWordSpacing = -1;
        TextPosition lastProcessedCharacter = null;
        
        for( int i=0; i<charactersByArticle.size(); i++)
        {
            startParagraph();
            List textList = (List)charactersByArticle.get( i );
            if( sortByPosition )
            {
                TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
                Collections.sort( textList, comparator );
            }
            Iterator textIter = textList.iterator();
            while( textIter.hasNext() )
            {
                TextPosition position = (TextPosition)textIter.next();
                String characterValue = position.getCharacter();
                
                //wordSpacing = position.getWordSpacing();
                float wordSpacing = 0;
                
                if( wordSpacing == 0 )
                {
                    //try to get width of a space character
                    wordSpacing = position.getWidthOfSpace();
                    //if still zero fall back to getting the width of the current
                    //character
                    if( wordSpacing == 0 )
                    {
                        wordSpacing = position.getWidth();
                    }
                }
                
                
                // RDD - We add a conservative approximation for space determination.
                // basically if there is a blank area between two characters that is
                //equal to some percentage of the word spacing then that will be the
                //start of the next word
                if( lastWordSpacing <= 0 )
                {
                    startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
                }
                else
                {
                    startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
                }
                
                lastWordSpacing = wordSpacing;
    
                // RDD - We will suppress text that is very close to the current line
                // and which overwrites previously rendered text on this line.
                // This is done specifically to handle a reasonably common situation
                // where an application (MS Word, in the case of my examples) renders
                // text four times at small (1 point) offsets in order to accomplish
                // bold printing.  You would not want to do this step if you were
                // going to render the TextPosition objects graphically.
                //
                /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
                    (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
                {
                    if (log.isDebugEnabled())
                    {
                        log.debug("Suppressing text overwrite" +
                                  " x: " + position.getX() +
                                  " endOfLastTextX: " + endOfLastTextX +
                                  " string: " + position.getCharacter());
                    }
                    continue;
                }*/
    
                // RDD - Here we determine whether this text object is on the current
                // line.  We use the lastBaselineFontSize to handle the superscript
                // case, and the size of the current font to handle the subscript case.
                // Text must overlap with the last rendered baseline text by at least
                // a small amount in order to be considered as being on the same line.
                //
                int verticalScaling = 1;
                if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
                {
                    verticalScaling = -1;
                }
                if (currentY != -1 &&
                    ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
                     (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
                {
                    output.write(getLineSeparator());
                    endOfLastTextX = -1;
                    startOfNextWordX = -1;
                    currentY = -1;
                    lastBaselineFontSize = -1;
                }
    
                if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
                   lastProcessedCharacter != null &&
                   //only bother adding a space if the last character was not a space
                   lastProcessedCharacter.getCharacter() != null &&
                   !lastProcessedCharacter.getCharacter().endsWith( " " ) )
                {
                    output.write(getWordSeparator());
                }
    
                if (currentY == -1)
                {
                    currentY = position.getY();
                }
    
                if (currentY == position.getY())
                {
                    lastBaselineFontSize = position.getFontSize();
                }
    
                // RDD - endX is what PDF considers to be the x coordinate of the
                // end position of the text.  We use it in computing our metrics below.
                //
                endOfLastTextX = position.getX() + position.getWidth();
    
    
                if (characterValue != null)
                {
                    writeCharacters( position );
                }
                else
                {
                    //Position.getString() is null so not writing anything
                }
                lastProcessedCharacter = position;
            }
            endParagraph();
        }
        

        // RDD - newline at end of flush - required for end of page (so that the top
        // of the next page starts on its own line.
        //
        output.write(getPageSeparator());

        output.flush();
    }
    
    /**
     * Write the string to the output stream.
     *  
     * @param text The text to write to the stream.
     * @throws IOException If there is an error when writing the text.
     */
    protected void writeCharacters( TextPosition text ) throws IOException
    {
        output.write( text.getCharacter() );
    }

    /**
     * This will determine of two floating point numbers are within a specified variance.
     *
     * @param first The first number to compare to.
     * @param second The second number to compare to.
     * @param variance The allowed variance.
     */
    private boolean within( float first, float second, float variance )
    {
        return second > first - variance && second < first + variance;
    }

    /**
     * This will show add a character to the list of characters to be printed to
     * the text file.
     *
     * @param text The description of the character to display.
     */
    protected void showCharacter( TextPosition text )
    {
        boolean showCharacter = true;
        if( suppressDuplicateOverlappingText )
        {
            showCharacter = false;
            String textCharacter = text.getCharacter();
            float textX = text.getX();
            float textY = text.getY();
            List sameTextCharacters = (List)characterListMapping.get( textCharacter );
            if( sameTextCharacters == null )
            {
                sameTextCharacters = new ArrayList();
                characterListMapping.put( textCharacter, sameTextCharacters );
            }
    
            // RDD - Here we compute the value that represents the end of the rendered
            // text.  This value is used to determine whether subsequent text rendered
            // on the same line overwrites the current text.
            //
            // We subtract any positive padding to handle cases where extreme amounts
            // of padding are applied, then backed off (not sure why this is done, but there
            // are cases where the padding is on the order of 10x the character width, and
            // the TJ just backs up to compensate after each character).  Also, we subtract
            // an amount to allow for kerning (a percentage of the width of the last
            // character).
            //
            boolean suppressCharacter = false;
            float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
            for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
            {
                TextPosition character = (TextPosition)sameTextCharacters.get( i );
                String charCharacter = character.getCharacter();
                float charX = character.getX();
                float charY = character.getY();
                //only want to suppress
                
                if( charCharacter != null &&
                    //charCharacter.equals( textCharacter ) &&
                    within( charX, textX, tolerance ) &&
                    within( charY, 
                            textY, 
                            tolerance ) )
                {
                    suppressCharacter = true;
                }
            }
            if( !suppressCharacter )
            {
                sameTextCharacters.add( text );
                showCharacter = true;
            }
        }
        
        if( showCharacter )
        {
            //if we are showing the character then we need to determine which
            //article it belongs to.
            int foundArticleDivisionIndex = -1;
            int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
            int notFoundButFirstLeftArticleDivisionIndex = -1;
            int notFoundButFirstAboveArticleDivisionIndex = -1;
            float x = text.getX();
            float y = text.getY();
            if( shouldSeparateByBeads )
            {
                for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -