📄 pdfparser.java
字号:
/**
* Copyright (c) 2003-2006, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.pdfparser;
import java.io.File;
import java.io.InputStream;
import java.io.IOException;
import java.util.Iterator;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.exceptions.WrappedIOException;
import org.pdfbox.io.RandomAccess;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.fdf.FDFDocument;
import org.pdfbox.persistence.util.COSObjectKey;
/**
* This class will handle the parsing of the PDF document.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.53 $
*/
public class PDFParser extends BaseParser
{
/**
* Byte value of the ASCII space character.
*/
private static final int SPACE_BYTE = 32;
/**
* Marker text of the header line. parse() searches the first line of the
* input for it and reads the version number that follows it.
*/
private static final String PDF_HEADER = "%PDF-";
/**
* The COS level document that parse() creates and fills in. It has no
* initializer, so it is null until parse() has assigned it.
*/
private COSDocument document;
/**
* Temp file directory. When set, and no RandomAccess was supplied, parse()
* passes it to the COSDocument constructor.
*/
private File tempDirectory = null;
/**
* Optional RandomAccess supplied through the two argument constructor. When
* it is not null, parse() passes it to the COSDocument constructor instead
* of the temp directory.
*/
private RandomAccess raf = null;
/**
* Constructor.
*
* @param input The input stream that contains the PDF document.
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser( final InputStream input ) throws IOException
{
    // Delegate to the two argument constructor with no caller supplied RandomAccess.
    this( input, null );
}
/**
* Constructor to allow control over RandomAccessFile.
* @param input The input stream that contains the PDF document.
* @param rafi The RandomAccessFile to be used in internal COSDocument
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser(InputStream input, RandomAccess rafi) throws IOException
{
    super( input );
    // Keep the caller's RandomAccess (null when the one argument constructor
    // was used) so that parse() can hand it to the COSDocument.
    raf = rafi;
}
/**
* This is the directory where pdfbox will create a temporary file
* for storing pdf document stream in. By default this directory will
* be the value of the system property java.io.tmpdir.
*
* @param tmpDir The directory to create scratch files needed to store
* pdf document streams.
*/
public void setTempDirectory( File tmpDir )
{
    // Remembered for parse(), which passes it to the COSDocument constructor
    // when no RandomAccess was supplied.
    this.tempDirectory = tmpDir;
}
/**
* This will parse the stream and create the PDF document. This will close
* the stream when it is done parsing.
*
* @throws IOException If there is an error reading from the stream.
*/
public void parse() throws IOException
{
    try
    {
        // Choose how the COSDocument is constructed: a caller supplied
        // RandomAccess takes precedence, then an explicit temp directory,
        // otherwise the no-argument constructor is used.
        if( raf != null )
        {
            document = new COSDocument( raf );
        }
        else if( tempDirectory != null )
        {
            document = new COSDocument( tempDirectory );
        }
        else
        {
            document = new COSDocument();
        }
        setDocument( document );

        String header = readLine();
        // The header string is stored exactly as read, before any leading
        // garbage is trimmed off below.
        document.setHeaderString( header );
        if( header.length() < PDF_HEADER.length()+1 )
        {
            throw new IOException( "Error: Header is corrupt '" + header + "'" );
        }

        //sometimes there are some garbage bytes in the header before the header
        //actually starts, so lets try to find the header first.
        int headerStart = header.indexOf( PDF_HEADER );
        //greater than zero because if it is zero then
        //there is no point of trimming.  A header that does not contain the
        //marker at all (-1) is also left untouched, and the version is still
        //read from the same character offsets below.
        //NOTE(review): presumably this is what lets "%FDF-" headers through;
        //confirm before making this check stricter.
        if( headerStart > 0 )
        {
            //trim off any leading characters
            header = header.substring( headerStart, header.length() );
        }

        try
        {
            // The version is the (up to) three characters following the marker, e.g. "1.4".
            int versionStart = PDF_HEADER.length();
            int versionEnd = Math.min( header.length(), versionStart + 3 );
            float pdfVersion = Float.parseFloat( header.substring( versionStart, versionEnd ) );
            document.setVersion( pdfVersion );
        }
        catch( NumberFormatException e )
        {
            // Same message as before, but keep the original exception as the
            // cause so the stack trace is not lost.
            IOException versionError = new IOException( "Error getting pdf version:" + e );
            versionError.initCause( e );
            throw versionError;
        }

        skipHeaderFillBytes();

        Object nextObject;
        // Tracks whether the most recently parsed object was an xref; used
        // below to decide whether an IOException may be tolerated.
        boolean wasLastParsedObjectAnXref = false;
        try
        {
            while( (nextObject = parseObject()) != null )
            {
                if( nextObject instanceof PDFXref )
                {
                    addXref( (PDFXref)nextObject );
                    wasLastParsedObjectAnXref = true;
                }
                else
                {
                    wasLastParsedObjectAnXref = false;
                }
                skipSpaces();
            }

            // No trailer was set while parsing: build one by merging every
            // object of type "XRef" into a fresh dictionary.
            if( document.getTrailer() == null )
            {
                COSDictionary trailer = new COSDictionary();
                Iterator xrefIter = document.getObjectsByType( "XRef" ).iterator();
                while( xrefIter.hasNext() )
                {
                    COSStream next = (COSStream)((COSObject)xrefIter.next()).getObject();
                    trailer.addAll( next );
                }
                document.setTrailer( trailer );
            }
            if( !document.isEncrypted() )
            {
                document.dereferenceObjectStreams();
            }
        }
        catch( IOException e )
        {
            //If the last parsed object was an xref then we assume that there
            //is just random garbage after the xref, not sure why the PDF spec
            //allows this but it does.  Any other error is passed along.
            //NOTE(review): when the error is tolerated, any of the steps above
            //that had not run yet (trailer fallback, dereferencing of object
            //streams) are skipped; confirm that this is intended.
            if( !wasLastParsedObjectAnXref )
            {
                throw e;
            }
        }
    }
    catch( Throwable t )
    {
        //so if the PDF is corrupt then close the document and clear
        //all resources to it
        if( document != null )
        {
            try
            {
                document.close();
            }
            catch( Exception ignored )
            {
                // A failure while cleaning up must not replace the error that
                // made parsing fail; that original error is rethrown below.
            }
        }
        if( t instanceof IOException )
        {
            throw (IOException)t;
        }
        else
        {
            throw new WrappedIOException( t );
        }
    }
    finally
    {
        // The input is closed whether or not parsing succeeded.
        pdfSource.close();
    }
}
/**
* This will skip a header's binary fill bytes. This is in accordance with
* PDF Specification 1.5 pg 68 section 3.4.1 "Syntax.File Structure.File Header"
*
* @throws IOException If there is an error reading from the stream.
*/
protected void skipHeaderFillBytes() throws IOException
{
    skipSpaces();
    final char upcoming = (char)pdfSource.peek();
    if( Character.isDigit( upcoming ) )
    {
        // The next content starts with a digit: no fill bytes to skip.
        return;
    }
    // Anything else is treated as the fill byte line (fill bytes conform with
    // the PDF reference even without the comment sign) => skip until EOL.
    readLine();
}
/**
* This will get the document that was parsed. parse() must be called before this is called.
* When you are done with this document you must call close() on it to release
* resources.
*
* @return The document that was parsed.
*
* @throws IOException If there is an error getting the document.
*/
public COSDocument getDocument() throws IOException
{
    if( document != null )
    {
        return document;
    }
    // The field is only assigned by parse(), so a null value means parse()
    // has not been called yet.
    throw new IOException( "You must call parse() before calling getDocument()" );
}
/**
* This will get the PD document that was parsed. When you are done with
* this document you must call close() on it to release resources.
*
* @return The document at the PD layer.
*
* @throws IOException If there is an error getting the document.
*/
public PDDocument getPDDocument() throws IOException
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -