📄 pdftextstripper.java
字号:
numberOfArticleSections = 1;
}
int originalSize = charactersByArticle.size();
charactersByArticle.setSize( numberOfArticleSections );
for( int i=0; i<numberOfArticleSections; i++ )
{
if( numberOfArticleSections < originalSize )
{
((List)charactersByArticle.get( i )).clear();
}
else
{
charactersByArticle.set( i, new ArrayList() );
}
}
characterListMapping.clear();
processStream( page, page.findResources(), content );
flushText();
endPage( page );
}
}
/**
 * Hook invoked when a new paragraph starts. The default implementation does
 * nothing; subclasses may override it to provide additional information.
 *
 * Within this class, flushText() calls it once before writing the text of
 * each entry in charactersByArticle.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void startParagraph() throws IOException
{
//default is to do nothing.
}
/**
 * Hook invoked when a paragraph ends. The default implementation does
 * nothing; subclasses may override it to provide additional information.
 *
 * Within this class, flushText() calls it once after all text positions of
 * an entry in charactersByArticle have been handled.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void endParagraph() throws IOException
{
//default is to do nothing
}
/**
 * Hook invoked when a new page starts. The default implementation does
 * nothing; subclasses may override it to provide additional information.
 *
 * NOTE(review): the call site is not part of this excerpt; presumably this
 * runs before the page's content is processed - confirm against the caller.
 *
 * @param page The page we are about to process.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void startPage( PDPage page ) throws IOException
{
//default is to do nothing.
}
/**
 * Hook invoked when a page ends. The default implementation does nothing;
 * subclasses may override it to provide additional information.
 *
 * The visible caller invokes it after processStream(...) and flushText()
 * have run for the page.
 *
 * @param page The page that has just been processed.
 *
 * @throws IOException If there is any error writing to the stream.
 */
protected void endPage( PDPage page ) throws IOException
{
//default is to do nothing
}
/**
 * Writes all text collected so far to the output stream.
 *
 * Every entry of charactersByArticle is handled in turn, bracketed by
 * startParagraph() and endParagraph(). The text positions of an entry are
 * sorted first when sortByPosition is set, and are then emitted one by one:
 * a line separator is written when a position no longer overlaps the current
 * baseline vertically, and a word separator is written when the horizontal
 * gap to the previously emitted text is wide enough. Finally the page
 * separator is written and the output is flushed.
 *
 * @throws IOException If there is an error writing the text.
 */
protected void flushText() throws IOException
{
    // -1 marks "no value yet" for all of the running state below.
    final float unset = -1;
    float baselineY = unset;
    float baselineFontSize = unset;
    float previousEndX = unset;
    float nextWordStartX = unset;
    float previousSpacing = unset;
    TextPosition previousPosition = null;

    for( int article = 0; article < charactersByArticle.size(); article++ )
    {
        startParagraph();

        List positions = (List)charactersByArticle.get( article );
        if( sortByPosition )
        {
            Collections.sort( positions, new TextPositionComparator( getCurrentPage() ) );
        }

        for( Iterator cursor = positions.iterator(); cursor.hasNext(); )
        {
            TextPosition position = (TextPosition)cursor.next();
            String glyphText = position.getCharacter();

            // The position's own word spacing is not consulted. The spacing
            // estimate is getWidthOfSpace(), falling back to the width of the
            // current text when that value is zero.
            float spacing = position.getWidthOfSpace();
            if( spacing == 0 )
            {
                spacing = position.getWidth();
            }

            // Conservative word-break estimate: a blank area of at least half
            // the spacing (averaged with the previous spacing when there is
            // one) after the previous text marks the start of the next word.
            float referenceSpacing = spacing;
            if( previousSpacing > 0 )
            {
                referenceSpacing = (spacing + previousSpacing) / 2f;
            }
            nextWordStartX = previousEndX + (referenceSpacing * 0.50f);
            previousSpacing = spacing;

            // An earlier in-line check that skipped text overwriting previously
            // rendered text on the same line (e.g. bold simulated by rendering
            // the same text several times at small offsets) is disabled in
            // this method.

            float y = position.getY();
            float fontSize = position.getFontSize();

            // Same-line test: the last baseline font size covers the
            // superscript case and the current font size the subscript case.
            // The sign is flipped when either font size is negative.
            int direction = ( baselineFontSize < 0 || fontSize < 0 ) ? -1 : 1;
            if( baselineY != unset )
            {
                float lowerBound = baselineY - (baselineFontSize * 0.9f * direction);
                float upperBound = baselineY + (fontSize * 0.9f * direction);
                if( y < lowerBound || y > upperBound )
                {
                    output.write(getLineSeparator());
                    previousEndX = unset;
                    nextWordStartX = unset;
                    baselineY = unset;
                    baselineFontSize = unset;
                }
            }

            boolean gapReached = nextWordStartX != unset && nextWordStartX < position.getX();
            if( gapReached && previousPosition != null )
            {
                //only bother adding a separator if the last text did not end with a space
                String previousText = previousPosition.getCharacter();
                if( previousText != null && !previousText.endsWith( " " ) )
                {
                    output.write(getWordSeparator());
                }
            }

            if( baselineY == unset )
            {
                baselineY = y;
            }
            if( baselineY == y )
            {
                baselineFontSize = fontSize;
            }

            // End x coordinate of this text; feeds the word-break estimate of
            // the next position.
            previousEndX = position.getX() + position.getWidth();

            // Positions without character data are tracked but not written.
            if( glyphText != null )
            {
                writeCharacters( position );
            }
            previousPosition = position;
        }

        endParagraph();
    }

    // Separator at the end of the flush so that the top of the next page
    // starts on its own line.
    output.write(getPageSeparator());
    output.flush();
}
/**
 * Writes the character data of a single text position to the output stream.
 * flushText() only calls this for positions whose getCharacter() value is
 * not null.
 *
 * @param text The text to write to the stream.
 * @throws IOException If there is an error when writing the text.
 */
protected void writeCharacters( TextPosition text ) throws IOException
{
    final String glyphs = text.getCharacter();
    output.write( glyphs );
}
/**
 * Determines whether two floating point numbers are within a specified
 * variance of each other. Both bounds are exclusive, so a value equal to
 * either bound (and any value when the variance is zero) is not within.
 *
 * @param first The first number to compare to.
 * @param second The second number to compare to.
 * @param variance The allowed variance.
 *
 * @return true if second lies strictly between first - variance and
 *         first + variance.
 */
private boolean within( float first, float second, float variance )
{
    final float lowerBound = first - variance;
    final float upperBound = first + variance;
    return lowerBound < second && second < upperBound;
}
/**
* This will add a character to the list of characters to be printed to
* the text file.
*
* @param text The description of the character to display.
*/
protected void showCharacter( TextPosition text )
{
boolean showCharacter = true;
if( suppressDuplicateOverlappingText )
{
showCharacter = false;
String textCharacter = text.getCharacter();
float textX = text.getX();
float textY = text.getY();
List sameTextCharacters = (List)characterListMapping.get( textCharacter );
if( sameTextCharacters == null )
{
sameTextCharacters = new ArrayList();
characterListMapping.put( textCharacter, sameTextCharacters );
}
// RDD - Here we compute the value that represents the end of the rendered
// text. This value is used to determine whether subsequent text rendered
// on the same line overwrites the current text.
//
// We subtract any positive padding to handle cases where extreme amounts
// of padding are applied, then backed off (not sure why this is done, but there
// are cases where the padding is on the order of 10x the character width, and
// the TJ just backs up to compensate after each character). Also, we subtract
// an amount to allow for kerning (a percentage of the width of the last
// character).
//
boolean suppressCharacter = false;
float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
{
TextPosition character = (TextPosition)sameTextCharacters.get( i );
String charCharacter = character.getCharacter();
float charX = character.getX();
float charY = character.getY();
//only want to suppress
if( charCharacter != null &&
//charCharacter.equals( textCharacter ) &&
within( charX, textX, tolerance ) &&
within( charY,
textY,
tolerance ) )
{
suppressCharacter = true;
}
}
if( !suppressCharacter )
{
sameTextCharacters.add( text );
showCharacter = true;
}
}
if( showCharacter )
{
//if we are showing the character then we need to determine which
//article it belongs to.
int foundArticleDivisionIndex = -1;
int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
int notFoundButFirstLeftArticleDivisionIndex = -1;
int notFoundButFirstAboveArticleDivisionIndex = -1;
float x = text.getX();
float y = text.getY();
if( shouldSeparateByBeads )
{
for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -