📄 pdftextstripper.java
字号:
{
PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
if( bead != null )
{
PDRectangle rect = bead.getRectangle();
if( rect.contains( x, y ) )
{
foundArticleDivisionIndex = i*2+1;
}
else if( (x < rect.getLowerLeftX() ||
y < rect.getUpperRightY()) &&
notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
{
notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
}
else if( x < rect.getLowerLeftX() &&
notFoundButFirstLeftArticleDivisionIndex == -1)
{
notFoundButFirstLeftArticleDivisionIndex = i*2;
}
else if( y < rect.getUpperRightY() &&
notFoundButFirstAboveArticleDivisionIndex == -1)
{
notFoundButFirstAboveArticleDivisionIndex = i*2;
}
}
else
{
foundArticleDivisionIndex = 0;
}
}
}
else
{
foundArticleDivisionIndex = 0;
}
int articleDivisionIndex = -1;
if( foundArticleDivisionIndex != -1 )
{
articleDivisionIndex = foundArticleDivisionIndex;
}
else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
{
articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
}
else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
{
articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
}
else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
{
articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
}
else
{
articleDivisionIndex = charactersByArticle.size()-1;
}
List textList = (List) charactersByArticle.get( articleDivisionIndex );
textList.add( text );
}
}
/**
* This is the page that the text extraction will start on. The pages start
* at page 1. For example in a 5 page PDF document, if the start page is 1
* then all pages will be extracted. If the start page is 4 then pages 4 and 5
* will be extracted. The default value is 1.
*
* @return Value of property startPage.
*/
public int getStartPage()
{
    // Plain read of the configured first page; nothing is validated here.
    return this.startPage;
}
/**
* This will set the first page to be extracted by this class.
*
* @param startPageValue New value of property startPage.
*/
public void setStartPage(int startPageValue)
{
    // Stored exactly as given; this setter applies no range check.
    this.startPage = startPageValue;
}
/**
* This will get the last page that will be extracted. This is inclusive;
* for example, in a 5 page PDF an endPage value of 5 would extract the
* entire document, and an end page of 2 would extract pages 1 and 2. This defaults
* to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
*
* @return Value of property endPage.
*/
public int getEndPage()
{
    // Returns the stored last-page value unchanged.
    return this.endPage;
}
/**
* This will set the last page to be extracted by this class.
*
* @param endPageValue New value of property endPage.
*/
public void setEndPage(int endPageValue)
{
    // Stored exactly as given; this setter does not compare it against the start page.
    this.endPage = endPageValue;
}
/**
* Set the desired line separator for output text. The line.separator
* system property is used if the line separator preference is not set
* explicitly using this method.
*
* @param separator The desired line separator string.
*/
public void setLineSeparator(String separator)
{
    // The reference is kept as passed in; null is not rejected here.
    this.lineSeparator = separator;
}
/**
* This will get the line separator.
*
* @return The desired line separator string.
*/
public String getLineSeparator()
{
    // Returns whatever separator is currently stored on this instance.
    return this.lineSeparator;
}
/**
* Set the desired page separator for output text. The line.separator
* system property is used if the page separator preference is not set
* explicitly using this method.
*
* @param separator The desired page separator string.
*/
public void setPageSeparator(String separator)
{
    // The reference is kept as passed in; null is not rejected here.
    this.pageSeparator = separator;
}
/**
* This will get the word separator.
*
* @return The desired word separator string.
*/
public String getWordSeparator()
{
    // Returns whatever word separator is currently stored on this instance.
    return this.wordSeparator;
}
/**
* Set the desired word separator for output text. The PDFBox text extraction
* algorithm will output a space character if there is enough space between
* two words. By default a space character is used. If you need an accurate
* count of characters that are found in a PDF document then you might want to
* set the word separator to the empty string.
*
* @param separator The desired word separator string.
*/
public void setWordSeparator(String separator)
{
    // The reference is kept as passed in; null is not rejected here.
    this.wordSeparator = separator;
}
/**
* This will get the page separator.
*
* @return The page separator string.
*/
public String getPageSeparator()
{
    // Returns whatever page separator is currently stored on this instance.
    return this.pageSeparator;
}
/**
* This will tell if the text stripper should suppress duplicate overlapping text.
*
* @return Returns the suppressDuplicateOverlappingText.
*/
public boolean shouldSuppressDuplicateOverlappingText()
{
    // Reports the current value of the suppression flag.
    return this.suppressDuplicateOverlappingText;
}
/**
* Get the current page number that is being processed.
*
* @return A 1 based number representing the current page.
*/
protected int getCurrentPageNo()
{
    // Reads the page counter field; this method does not modify it.
    return this.currentPageNo;
}
/**
* The output stream that is being written to.
*
* @return The stream that output is being written to.
*/
protected Writer getOutput()
{
    // Returns the writer reference held by this instance, not a wrapper around it.
    return this.output;
}
/**
* Character strings are grouped by articles. It is quite common that there
* will only be a single article. This returns a List that contains List objects,
* the inner lists will contain TextPosition objects.
*
* @return A double List of TextPositions for all text strings on the page.
*/
protected List getCharactersByArticle()
{
    // Returns the list field itself rather than a copy.
    return this.charactersByArticle;
}
/**
* By default the text stripper will attempt to remove text that overlaps each other.
* Word paints the same character several times in order to make it look bold. By setting
* this to false all text will be extracted, which means that certain sections will be
* duplicated, but better performance will be noticed.
*
* @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
*/
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingTextValue)
{
    // The parameter name differs from the field name, so no qualification is needed.
    suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
}
/**
* This will tell if the text stripper should separate by beads.
*
* @return If the text will be grouped by beads.
*/
public boolean shouldSeparateByBeads()
{
    // The field shares this method's name; the lookup below resolves to the field.
    return this.shouldSeparateByBeads;
}
/**
* Set if the text stripper should group the text output by a list of beads. The default value is true!
*
* @param aShouldSeparateByBeads The new grouping of beads.
*/
public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
{
    // The parameter carries an 'a' prefix, so the bare name refers to the field.
    shouldSeparateByBeads = aShouldSeparateByBeads;
}
/**
* Get the bookmark where text extraction should end, inclusive. Default is null.
*
* @return The ending bookmark.
*/
public PDOutlineItem getEndBookmark()
{
    // Returns the stored bookmark reference as-is.
    return this.endBookmark;
}
/**
* Set the bookmark where the text extraction should stop.
*
* @param aEndBookmark The ending bookmark.
*/
public void setEndBookmark(PDOutlineItem aEndBookmark)
{
    // The reference is kept as passed in; null is not rejected here.
    this.endBookmark = aEndBookmark;
}
/**
* Get the bookmark where text extraction should start, inclusive. Default is null.
*
* @return The starting bookmark.
*/
public PDOutlineItem getStartBookmark()
{
    // Returns the stored bookmark reference as-is.
    return this.startBookmark;
}
/**
* Set the bookmark where text extraction should start, inclusive.
*
* @param aStartBookmark The starting bookmark.
*/
public void setStartBookmark(PDOutlineItem aStartBookmark)
{
    // The reference is kept as passed in; null is not rejected here.
    this.startBookmark = aStartBookmark;
}
/**
* This will tell if the text stripper should sort the text tokens
* before writing to the stream.
*
* @return true If the text tokens will be sorted before being written.
*/
public boolean shouldSortByPosition()
{
    // Reports the current value of the sort-by-position flag.
    return this.sortByPosition;
}
/**
* The text tokens in a PDF file may not be stored in the same order
* as they appear visually on the screen. For example, a PDF writer may
* write out all text by font, so all bold or larger text, then make a second
* pass and write out the normal text.<br/>
* The default is to <b>not</b> sort by position.<br/>
* <br/>
* A PDF writer could choose to write each character in a different order. By
* default PDFBox does <b>not</b> sort the text tokens before processing them due to
* performance reasons.
*
* @param newSortByPosition Tell PDFBox to sort the text positions.
*/
public void setSortByPosition(boolean newSortByPosition)
{
    // Only the flag is updated; this setter does no sorting itself.
    this.sortByPosition = newSortByPosition;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -