// File: PDFTextStripper.java
/**
* Copyright (c) 2003-2005, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.util;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Vector;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSStream;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.pdmodel.common.PDStream;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
/**
* This class will take a pdf document and strip out all of the text and ignore the
* formatting and such.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.69 $
*/
public class PDFTextStripper extends PDFStreamEngine
{
// One-based number of the page currently being processed. writeText resets it to 0
// and processPage increments it before deciding whether the page is in range.
private int currentPageNo = 0;
// First page to extract (inclusive); processPage compares it against currentPageNo.
private int startPage = 1;
// Last page to extract (inclusive); the default imposes no upper limit.
private int endPage = Integer.MAX_VALUE;
// Optional bookmark marking where extraction starts; null means no start bookmark.
private PDOutlineItem startBookmark = null;
// One-based page number resolved from startBookmark by processPages.
// -1 makes processPage skip the start-bookmark check.
private int startBookmarkPageNumber = -1;
// Optional bookmark marking where extraction ends; null means no end bookmark.
private PDOutlineItem endBookmark = null;
// One-based page number resolved from endBookmark by processPages.
// -1 makes processPage skip the end-bookmark check.
private int endBookmarkPageNumber = -1;
// The document handed to the most recent writeText call.
private PDDocument document;
// NOTE(review): not read in the visible part of this file; presumably controls whether
// text drawn more than once at the same position is dropped -- confirm against its users.
private boolean suppressDuplicateOverlappingText = true;
// Consulted by processPage after it computes the number of article sections; the
// effect of the check lies outside the visible part of this file.
private boolean shouldSeparateByBeads = true;
// NOTE(review): not read in the visible part of this file; presumably requests that
// text be ordered by its position on the page -- confirm against its users.
private boolean sortByPosition = false;
// Thread beads of the page being processed; assigned in processPage from
// page.getThreadBeads().
private List pageArticles = null;
/**
* The charactersByArticle is used to extract text by article divisions. For example
* a PDF that has two columns like a newspaper, we want to extract the first column and
* then the second column. In this example the PDF would have 2 beads(or articles), one for
* each column. The size of the charactersByArticle would be 5, because not all text on the
* screen will fall into one of the articles. The five divisions are shown below
*
* Text before first article
* first article text
* text between first article and second article
* second article text
* text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
*/
protected Vector charactersByArticle = new Vector();
// NOTE(review): no use of this map is visible in this part of the file; confirm its
// key/value types and purpose against the code that populates it.
private Map characterListMapping = new HashMap();
// Separator strings. The line and page separators default to the platform's
// "line.separator" system property; the word separator defaults to a single space.
// NOTE(review): where these are written to the output is not visible here.
private String lineSeparator = System.getProperty("line.separator");
private String pageSeparator = System.getProperty("line.separator");
private String wordSeparator = " ";
/**
* The stream to write the output to.
*/
protected Writer output;
/**
* Creates a text stripper configured from the bundled properties resource
* <code>Resources/PDFTextStripper.properties</code>. The resource is loaded with
* <code>ResourceLoader.loadProperties</code> and the result is handed to the
* superclass constructor.
*
* @throws IOException If there is an error loading the properties.
*/
public PDFTextStripper() throws IOException
{
super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
}
/**
* Creates a text stripper whose operator mappings come from the supplied properties
* instead of the bundled resource file. The properties object is passed unchanged to
* the superclass constructor.
*
* @param props The properties containing the mapping of operators to PDFOperator
* classes.
*
* @throws IOException If there is an error reading the properties.
*/
public PDFTextStripper( Properties props ) throws IOException
{
super( props );
}
/**
 * Extracts the text of a document and returns it as one string. The work is done by
 * {@link #writeText(PDDocument, Writer)} writing into an in-memory buffer. <br />
 * NOTE: The document should not be encrypted; writeText only copes with documents
 * that can be decrypted using the empty password.
 *
 * @param doc The document to get the text from.
 *
 * @return The text of the PDF document.
 *
 * @throws IOException if the doc state is invalid or it is encrypted.
 */
public String getText( PDDocument doc ) throws IOException
{
    // Collect everything writeText emits in memory, then hand it back as a String.
    Writer buffer = new StringWriter();
    writeText( doc, buffer );
    return buffer.toString();
}
/**
 * Extracts the text of a low-level COS document by wrapping it in a new PDDocument
 * and delegating to {@link #getText(PDDocument)}.
 *
 * @deprecated Use {@link #getText(PDDocument)} instead.
 * @see PDFTextStripper#getText( PDDocument )
 * @param doc The document to extract the text from.
 * @return The document text.
 * @throws IOException If there is an error extracting the text.
 */
public String getText( COSDocument doc ) throws IOException
{
    PDDocument wrapped = new PDDocument( doc );
    return getText( wrapped );
}
/**
 * Writes the text of a low-level COS document to the given writer by wrapping it in
 * a new PDDocument and delegating to {@link #writeText(PDDocument, Writer)}.
 *
 * @deprecated Use {@link #writeText(PDDocument, Writer)} instead.
 * @see PDFTextStripper#writeText( PDDocument, Writer )
 * @param doc The document to extract the text.
 * @param outputStream The stream to write the text to.
 * @throws IOException If there is an error extracting the text.
 */
public void writeText( COSDocument doc, Writer outputStream ) throws IOException
{
    PDDocument wrapped = new PDDocument( doc );
    writeText( wrapped, outputStream );
}
/**
 * This will take a PDDocument and write the text of that document to the given writer.
 * <p>
 * The engine and the page counter are reset first, so one stripper instance can be
 * used for several documents in sequence. {@link #startDocument(PDDocument)} is
 * called before any decryption attempt and {@link #endDocument(PDDocument)} after
 * all pages have been processed.
 * <p>
 * If the document reports itself as encrypted, one attempt is made to decrypt it
 * with the empty password. When that fails, an IOException is thrown whose cause is
 * the underlying decryption exception.
 *
 * @param doc The document to get the data from.
 * @param outputStream The location to put the text.
 *
 * @throws IOException If the doc is in an invalid state, or it is encrypted and
 * cannot be decrypted with the empty password.
 */
public void writeText( PDDocument doc, Writer outputStream ) throws IOException
{
    resetEngine();
    currentPageNo = 0;
    document = doc;
    output = outputStream;
    startDocument(document);
    if( document.isEncrypted() )
    {
        // We are expecting non-encrypted documents here, but it is common
        // for users to pass in a document that is encrypted with an empty
        // password (such a document appears to not be encrypted by
        // someone viewing the document, thus the confusion). We will
        // attempt to decrypt with the empty password to handle this case.
        try
        {
            document.decrypt("");
        }
        catch (CryptographyException e)
        {
            // Chain the original exception so its stack trace is not lost;
            // initCause keeps this compatible with pre-Java-6 IOException.
            IOException failure =
                new IOException("Error decrypting document, details: " + e.getMessage());
            failure.initCause( e );
            throw failure;
        }
        catch (InvalidPasswordException e)
        {
            // The empty password was rejected: the document really is protected.
            IOException failure = new IOException("Error: document is encrypted");
            failure.initCause( e );
            throw failure;
        }
    }
    processPages( document.getDocumentCatalog().getAllPages() );
    endDocument(document);
}
/**
 * Walks over every page of the document and hands each page that has a content
 * stream to {@link #processPage(PDPage, COSStream)}.
 * <p>
 * Before iterating, the start and end bookmarks (when set) are resolved to one-based
 * page numbers. If both bookmarks are the same outline item and neither resolves to
 * a page, both page numbers are set to 0 so that no page passes the range check in
 * processPage and no text is extracted.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages( List pages ) throws IOException
{
    if( startBookmark != null )
    {
        startBookmarkPageNumber = getPageNumber( startBookmark, pages );
    }
    if( endBookmark != null )
    {
        endBookmarkPageNumber = getPageNumber( endBookmark, pages );
    }

    boolean startUnresolved = startBookmark != null && startBookmarkPageNumber == -1;
    boolean endUnresolved = endBookmark != null && endBookmarkPageNumber == -1;
    if( startUnresolved && endUnresolved &&
        startBookmark.getCOSObject() == endBookmark.getCOSObject() )
    {
        // Special case: the start and end bookmark are the same item and it
        // points to nothing, so an empty page range is selected.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    for( Iterator it = pages.iterator(); it.hasNext(); )
    {
        PDPage page = (PDPage)it.next();
        PDStream pageContents = page.getContents();
        if( pageContents == null )
        {
            // Nothing to extract from a page without a content stream.
            continue;
        }
        processPage( page, pageContents.getStream() );
    }
}
/**
 * Resolves a bookmark to the one-based position of its destination page within the
 * given page list.
 *
 * @param bookmark The outline item whose destination page is looked up in the
 * current document.
 * @param allPages The list of pages to search for the destination page.
 *
 * @return -1 if the bookmark has no destination page; otherwise the result of
 * <code>allPages.indexOf</code> plus one (which is 0 when the page is not in the list).
 *
 * @throws IOException If there is an error finding the destination page.
 */
private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
{
    PDPage destination = bookmark.findDestinationPage( document );
    if( destination == null )
    {
        return -1;
    }
    // Convert the zero-based list index to one-based page numbering.
    return allPages.indexOf( destination ) + 1;
}
/**
* Hook for subclasses. writeText calls this once per document, after the engine has
* been reset and the output writer assigned, but before any decryption attempt and
* before any page is processed. The default implementation does nothing.
*
* @param pdf The PDF document that is being processed.
* @throws IOException If an IO error occurs.
*/
protected void startDocument(PDDocument pdf) throws IOException
{
// no default implementation, but available for subclasses
}
/**
* Hook for subclasses. writeText calls this once per document, after processPages
* has returned. The default implementation does nothing.
*
* @param pdf The PDF document that is being processed.
* @throws IOException If an IO error occurs.
*/
protected void endDocument(PDDocument pdf ) throws IOException
{
// no default implementation, but available for subclasses
}
/**
* This will process the contents of a page.
*
* @param page The page to process.
* @param content The contents of the page.
*
* @throws IOException If there is an error processing the page.
*/
protected void processPage( PDPage page, COSStream content ) throws IOException
{
currentPageNo++;
if( currentPageNo >= startPage && currentPageNo <= endPage &&
(startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
(endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
{
startPage( page );
pageArticles = page.getThreadBeads();
int numberOfArticleSections = 1 + pageArticles.size() * 2;
if( !shouldSeparateByBeads )
{
// NOTE: web code-viewer residue (keyboard-shortcut help text) removed here; it was not part of the Java source.