📄 pdfparser.java
字号:
{
return new PDDocument( getDocument() );
}
/**
 * Wraps the document that was parsed in a new FDFDocument. When you are
 * done with the returned document you must call close() on it to release
 * resources.
 *
 * @return A new FDFDocument built on top of the parsed document.
 *
 * @throws IOException If there is an error getting the document.
 */
public FDFDocument getFDFDocument() throws IOException
{
    final FDFDocument fdfDocument = new FDFDocument( getDocument() );
    return fdfDocument;
}
/**
 * This will parse a document object from the stream.
 *
 * After skipping any stray tokens that start with 'e' (repeated endobj), the
 * next character decides what is read:
 * <ul>
 * <li>end of stream: nothing is read;</li>
 * <li>'x', 't' or 's': the xref/trailer section, the optional startxref
 *     entry and the %%EOF marker are consumed;</li>
 * <li>anything else: one "number generation obj ... endobj" object (or a
 *     bare object starting with '<' that has no object number) is parsed
 *     and stored in the document's object pool.</li>
 * </ul>
 *
 * @return The PDFXref of the first xref section when one was parsed, the
 *         pooled COSObject when an indirect object was parsed, or null when
 *         the stream is at its end or only a startxref section was read.
 *
 * @throws IOException If an IO error occurs or the stream does not contain
 *         the expected keywords.
 */
private Object parseObject() throws IOException
{
    Object object = null;
    skipSpaces();
    char peekedChar = (char)pdfSource.peek();
    while( peekedChar == 'e' )
    {
        //there are times when there are multiple endobj, so lets
        //just read them and move on.
        readString();
        skipSpaces();
        peekedChar = (char)pdfSource.peek();
    }
    if( pdfSource.isEOF() )
    {
        //end of file we will return a null object and call it a day.
    }
    else if( peekedChar == 'x' ||
             peekedChar == 't' ||
             peekedChar == 's')
    {
        //FDF documents do not always have the xref
        if( peekedChar == 'x' || peekedChar == 't' )
        {
            object = parseXrefSection();
        }
        //if peeked char is xref or startxref
        if( peekedChar == 'x' || peekedChar == 's')
        {
            skipSpaces();
            //additional xref sections are parsed for their side effects only,
            //the value returned to the caller stays the first section.
            while( pdfSource.peek() == 'x' )
            {
                parseXrefSection();
            }
            String startxref = readString();
            if( !startxref.equals( "startxref" ) )
            {
                throw new IOException( "expected='startxref' actual='" + startxref + "' " + pdfSource );
            }
            skipSpaces();
            //read some integer that is in the stream but PDFBox doesn't use
            readInt();
        }
        //The marker is read with readExpectedString and not readString; per the
        //original author's note readString strips out comments and would
        //think that %% is a comment in front of the EOF.
        String eof = readExpectedString( "%%EOF" );
        if( eof.indexOf( "%%EOF" )== -1 && !pdfSource.isEOF() )
        {
            throw new IOException( "expected='%%EOF' actual='" + eof + "' next=" + readString() +
                                   " next=" +readString() );
        }
        else if( !pdfSource.isEOF() )
        {
            //we might really be at the end of the file, there might just be some
            //garbage at the end of the file.
            pdfSource.fillBuffer();
            if( pdfSource.available() < 1000 )
            {
                //We need to determine if we are at the end of the file.
                byte[] data = new byte[ 1000 ];
                int amountRead = pdfSource.read( data );
                if( amountRead != -1 )
                {
                    //only peeking: push everything back before inspecting it
                    pdfSource.unread( data, 0, amountRead );
                }
                boolean atEndOfFile = true;//we assume yes unless we find another.
                //A three byte marker starting at i needs i+2 < amountRead. The
                //previous bound (amountRead-3) never inspected the last byte
                //read, so an "EOF" that ended the data was missed and the
                //remaining content was wrongly thrown away below.
                for( int i=0; i<amountRead-2 && atEndOfFile; i++ )
                {
                    atEndOfFile = !(data[i] == 'E' &&
                                    data[i+1] == 'O' &&
                                    data[i+2] == 'F' );
                }
                if( atEndOfFile )
                {
                    while( pdfSource.read( data, 0, data.length ) != -1 )
                    {
                        //read until done.
                    }
                }
            }
        }
    }
    else
    {
        int number = -1;
        int genNum = -1;
        String objectKey = null;
        boolean missingObjectNumber = false;
        try
        {
            char peeked = (char)pdfSource.peek();
            if( peeked == '<' )
            {
                //the object starts directly with '<', there is no
                //"number generation obj" header in front of it
                missingObjectNumber = true;
            }
            else
            {
                number = readInt();
            }
        }
        catch( IOException e )
        {
            //ok for some reason "GNU Ghostscript 5.10" puts two endobj
            //statements after an object, of course this is nonsense
            //but because we want to support as many PDFs as possible
            //we will simply try again
            number = readInt();
        }
        if( !missingObjectNumber )
        {
            skipSpaces();
            genNum = readInt();
            objectKey = readString( 3 );
            if( !objectKey.equals( "obj" ) )
            {
                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource );
            }
        }
        else
        {
            number = -1;
            genNum = -1;
        }
        skipSpaces();
        COSBase pb = parseDirObject();
        String endObjectKey = readString();
        if( endObjectKey.equals( "stream" ) )
        {
            //push the keyword (and a separating space) back so that
            //parseCOSStream sees the stream keyword itself
            pdfSource.unread( endObjectKey.getBytes() );
            pdfSource.unread( ' ' );
            if( pb instanceof COSDictionary )
            {
                pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
            }
            else
            {
                // this is not legal
                // the combination of a dict and the stream/endstream forms a complete stream object
                throw new IOException("stream not preceded by dictionary");
            }
            endObjectKey = readString();
        }
        COSObjectKey key = new COSObjectKey( number, genNum );
        COSObject pdfObject = document.getObjectFromPool( key );
        object = pdfObject;
        pdfObject.setObject(pb);
        if( !endObjectKey.equals( "endobj" ) )
        {
            if( !pdfSource.isEOF() )
            {
                try
                {
                    //It is possible that the endobj is missing, there
                    //are several PDFs out there that do that so skip it and move on.
                    //A numeric token is taken to be the start of the next
                    //object and is pushed back for the next call.
                    Float.parseFloat( endObjectKey );
                    pdfSource.unread( SPACE_BYTE );
                    pdfSource.unread( endObjectKey.getBytes() );
                }
                catch( NumberFormatException e )
                {
                    //we will try again incase there was some garbage which
                    //some writers will leave behind.
                    String secondEndObjectKey = readString();
                    if( !secondEndObjectKey.equals( "endobj" ) )
                    {
                        if( isClosing() )
                        {
                            //found a case with 17506.pdf object 41 that was like this
                            //41 0 obj [/Pattern /DeviceGray] ] endobj
                            //notice the second array close, here we are reading it
                            //and ignoring and attempting to continue
                            pdfSource.read();
                        }
                        skipSpaces();
                        String thirdPossibleEndObj = readString();
                        if( !thirdPossibleEndObj.equals( "endobj" ) )
                        {
                            throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
                                                  "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
                        }
                    }
                }
            }
        }
        skipSpaces();
    }
    return object;
}
/**
 * Parses one xref section from the stream: the xref table first and then
 * the trailer that follows it.
 *
 * @return a new PDFXref created from the two values the xref table parse
 *         stored (start object number and count)
 *
 * @throws IOException If an IO error occurs.
 */
protected PDFXref parseXrefSection() throws IOException
{
    // slot 0 receives the start, slot 1 the count (filled by parseXrefTable)
    final int[] startAndCount = new int[2];
    parseXrefTable(startAndCount);
    parseTrailer();

    final int start = startAndCount[0];
    final int count = startAndCount[1];
    return new PDFXref(start, count);
}
/**
 * Parses the xref table from the stream, skipping over its entries.
 *
 * When the first line read is "xref", the two integers that follow are
 * stored in params[0] (starting object number) and params[1] (count);
 * otherwise params is left untouched. Tokens are then skipped until the
 * "trailer" keyword has been read, the stream is exhausted, or the next
 * character is an end-of-name character.
 *
 * @param params The start and count parameters
 *
 * @throws IOException If an IO error occurs.
 */
protected void parseXrefTable(int[] params) throws IOException
{
    String token = readLine();
    if( token.equals( "xref" ) )
    {
        params[0] = readInt();
        params[1] = readInt();
        token = readString();
    }
    skipSpaces();

    //skip past all the xref entries.
    while( !( token.equals( "trailer" )
              || pdfSource.isEOF()
              || isEndOfName( (char)pdfSource.peek() ) ) )
    {
        token = readString();
        skipSpaces();
    }
    skipSpaces();
}
/**
 * Parses a trailer dictionary from the stream and records it on the
 * document. The first trailer parsed becomes the document trailer; the
 * entries of any later one are added to the existing trailer.
 *
 * @throws IOException If an IO error occurs.
 */
private void parseTrailer() throws IOException
{
    final COSDictionary parsed = parseCOSDictionary();
    final COSDictionary existing = document.getTrailer();
    if( existing != null )
    {
        existing.addAll( parsed );
        return;
    }
    document.setTrailer( parsed );
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -