📄 baseparser.java
字号:
/**
* Copyright (c) 2003-2006, www.pdfbox.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the name of pdfbox; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://www.pdfbox.org
*
*/
package org.pdfbox.pdfparser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.pdfbox.io.ByteArrayPushBackInputStream;
import org.pdfbox.io.PushBackInputStream;
import org.pdfbox.io.RandomAccess;
import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSBoolean;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSNull;
import org.pdfbox.cos.COSNumber;
import org.pdfbox.cos.COSObject;
import org.pdfbox.cos.COSStream;
import org.pdfbox.cos.COSString;
import org.pdfbox.persistence.util.COSObjectKey;
/**
* This class is used to contain parsing logic that will be used by both the
* PDFParser and the COSStreamParser.
*
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @version $Revision: 1.59 $
*/
public abstract class BaseParser
{
/**
 * The ISO-8859-1 byte values of the keyword "endstream". Stream data is
 * compared against this array to find where a stream's content stops.
 */
public static final byte[] ENDSTREAM =
new byte[] {101,110,100,115,116,114,101,97,109};//"endstream".getBytes( "ISO-8859-1" );
/**
 * The keyword "def". When it follows a dictionary entry, parseCOSDictionary
 * consumes and ignores it (it shows up when parsing cmap streams).
 */
public static final String DEF = "def";
/**
 * This is the stream that will be read from. It is created by the
 * constructors and supports peeking at and pushing back bytes.
 */
protected PushBackInputStream pdfSource;
/**
 * Cross reference entries. They were moved here because they are a
 * persistence construct; they may not be needed at all when the file is not
 * read from the end with delayed access to objects.
 */
private List xrefs = new ArrayList();
/**
 * The document being parsed, assigned through setDocument. Indirect
 * references found in dictionary values are resolved through its object pool.
 */
private COSDocument document;
/**
* Constructor.
*
* @param input The input stream to read the data from.
*
* @throws IOException If there is an error reading the input stream.
*/
public BaseParser( InputStream input) throws IOException
{
//pdfSource = new PushBackByteArrayStream( input );
pdfSource = new PushBackInputStream( new BufferedInputStream( input, 16384 ), 4096 );
}
/**
 * Creates a parser that reads directly from an in-memory byte array, using a
 * push back stream backed by that array.
 *
 * @param input The array to read the data from.
 *
 * @throws IOException If there is an error reading the byte data.
 */
protected BaseParser(byte[] input) throws IOException
{
    this.pdfSource = new ByteArrayPushBackInputStream( input );
}
/**
 * Assigns the document that this parser works against.
 *
 * @param doc The current document.
 */
public void setDocument( COSDocument doc )
{
    this.document = doc;
}
private static boolean isHexDigit(char ch)
{
    // Plain range comparisons are used on purpose: looking the character up in
    // a string of hex digits was slower and could lead to problems with
    // certain versions of the IBM JIT compiler.
    boolean decimalDigit = ch >= '0' && ch <= '9';
    boolean lowerCaseLetter = ch >= 'a' && ch <= 'f';
    boolean upperCaseLetter = ch >= 'A' && ch <= 'F';
    return decimalDigit || lowerCaseLetter || upperCaseLetter;
}
/**
 * This will parse the value of a dictionary entry. The value is either a
 * direct object, or an indirect reference written as two numbers followed by
 * the letter R, in which case the referenced object is fetched from the
 * document's object pool.
 *
 * @return The direct object that was parsed, or the pooled object that the
 * indirect reference points to.
 *
 * @throws IOException If there is an error parsing the value, if an indirect
 * reference is malformed, or if an indirect reference is found while no
 * document has been set.
 */
private COSBase parseCOSDictionaryValue() throws IOException
{
    COSBase number = parseDirObject();
    skipSpaces();
    char next = (char)pdfSource.peek();
    if( next < '0' || next > '9' )
    {
        //no digit follows, so what was read is a plain direct object
        return number;
    }

    //a digit follows: treat the two values as "<object> <generation> R"
    COSBase generationNumber = parseDirObject();
    skipSpaces();
    char r = (char)pdfSource.read();
    if( r != 'R' )
    {
        throw new IOException( "expected='R' actual='" + r + "' " + pdfSource );
    }
    //report malformed references as a parse error instead of letting the
    //casts below fail with a ClassCastException or NullPointerException
    if( !(number instanceof COSInteger) || !(generationNumber instanceof COSInteger) )
    {
        throw new IOException( "expected integer object and generation numbers, actual='" +
            number + "' and '" + generationNumber + "' " + pdfSource );
    }
    if( document == null )
    {
        throw new IOException( "found an indirect reference but no document has been set " + pdfSource );
    }
    COSObjectKey key = new COSObjectKey( ((COSInteger)number).intValue(),
                                         ((COSInteger)generationNumber).intValue() );
    return document.getObjectFromPool( key );
}
/**
 * This will parse a PDF dictionary, i.e. a sequence of name/value pairs
 * enclosed in double angle brackets. Both the opening and the closing
 * brackets are consumed from the stream.
 *
 * @return The parsed dictionary.
 *
 * @throws IOException If there is an error reading the stream, if the
 * dictionary delimiters are missing or if an entry has no value.
 */
protected COSDictionary parseCOSDictionary() throws IOException
{
    char c = (char)pdfSource.read();
    if( c != '<')
    {
        throw new IOException( "expected='<' actual='" + c + "'" );
    }
    c = (char)pdfSource.read();
    if( c != '<')
    {
        throw new IOException( "expected='<' actual='" + c + "' " + pdfSource );
    }
    skipSpaces();
    COSDictionary obj = new COSDictionary();
    boolean done = false;
    while( !done )
    {
        skipSpaces();
        c = (char)pdfSource.peek();
        if( c == '>')
        {
            //start of the closing delimiter, it is consumed after the loop
            done = true;
        }
        else
        {
            COSName key = parseCOSName();
            COSBase value = parseCOSDictionaryValue();
            skipSpaces();
            if( ((char)pdfSource.peek()) == 'd' )
            {
                //if the next string is 'def' then we are parsing a cmap stream
                //and want to ignore it, otherwise it is put back untouched.
                String potentialDEF = readString();
                if( !potentialDEF.equals( DEF ) )
                {
                    //Encode with a fixed single byte charset rather than the
                    //platform default, so that the bytes pushed back do not
                    //depend on the machine the parser runs on.
                    pdfSource.unread( potentialDEF.getBytes( "ISO-8859-1" ) );
                }
                else
                {
                    skipSpaces();
                }
            }
            if( value == null )
            {
                throw new IOException("Bad Dictionary Declaration " + pdfSource );
            }
            obj.setItem( key, value );
        }
    }
    char ch = (char)pdfSource.read();
    if( ch != '>' )
    {
        throw new IOException( "expected='>' actual='" + ch + "'" );
    }
    ch = (char)pdfSource.read();
    if( ch != '>' )
    {
        throw new IOException( "expected='>' actual='" + ch + "'" );
    }
    return obj;
}
/**
 * This will read a COSStream from the input stream. The "stream" keyword and
 * the end of line marker after it are consumed, the data up to the
 * "endstream" keyword is copied into the new stream object and the
 * "endstream" keyword itself is consumed as well.
 *
 * @param dic The dictionary that goes with this stream.
 * @param file The file to write the stream to when reading.
 *
 * @return The parsed pdf stream.
 *
 * @throws IOException If there is an error reading the stream or if the
 * "stream" or "endstream" keyword is missing.
 */
protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
{
    COSStream stream = new COSStream( dic, file );
    OutputStream out = null;
    try
    {
        String streamString = readString();
        if (!streamString.equals("stream"))
        {
            throw new IOException("expected='stream' actual='" + streamString + "'");
        }

        //PDF Ref 3.2.7 A stream must be followed by either
        //a CRLF or LF but nothing else.
        int whitespace = pdfSource.read();

        //see brother_scan_cover.pdf, it adds whitespaces
        //after the stream but before the start of the
        //data, so just read those first
        while (whitespace == 0x20)
        {
            whitespace = pdfSource.read();
        }

        if( whitespace == 0x0D )
        {
            whitespace = pdfSource.read();
            //A CR that is not followed by LF is invalid according to the spec
            //but it happens in the real world, so the byte is treated as
            //stream data. The -1 end of file marker is never pushed back: it
            //is not a byte of the file, and a pushback stream would presumably
            //hand it out again as a data byte (java.io.PushbackInputStream
            //stores (byte)b).
            if( whitespace != 0x0A && whitespace != -1 )
            {
                pdfSource.unread( whitespace );
            }
        }
        else if( whitespace != 0x0A && whitespace != -1 )
        {
            //we are in an error.
            //but again we will do a lenient parsing and just assume that
            //everything is fine and that the byte belongs to the stream data
            pdfSource.unread( whitespace );
        }

        COSBase streamLength = dic.getDictionaryObject(COSName.LENGTH);
        out = stream.createFilteredStream( streamLength );

        //the length is wrong in some pdf documents which means
        //that PDFBox must basically ignore it in order to be able to read
        //the most number of PDF documents. The data is therefore always
        //scanned for the endstream keyword, which is a penalty hit.
        readUntilEndStream( out );
        skipSpaces();
        String endStream = readString();
        if (!endStream.equals("endstream"))
        {
            //give it a second chance and scan on to the next endstream keyword
            readUntilEndStream( out );
            endStream = readString();
            if( !endStream.equals( "endstream" ) )
            {
                throw new IOException("expected='endstream' actual='" + endStream + "' " + pdfSource);
            }
        }
    }
    finally
    {
        if( out != null )
        {
            out.close();
        }
    }
    return stream;
}
private void readUntilEndStream( OutputStream out ) throws IOException
{
int currentIndex = 0;
int byteRead = 0;
//this is the additional bytes buffered but not written
int additionalBytes=0;
byte[] buffer = new byte[ENDSTREAM.length+additionalBytes];
int writeIndex = 0;
while(!cmpCircularBuffer( buffer, currentIndex, ENDSTREAM ) && byteRead != -1 )
{
writeIndex = currentIndex - buffer.length;
if( writeIndex >= 0 )
{
out.write( buffer[writeIndex%buffer.length] );
}
byteRead = pdfSource.read();
buffer[currentIndex%buffer.length] = (byte)byteRead;
currentIndex++;
}
//we want to ignore the end of the line data when reading a stream
//so will make an attempt to ignore it.
/*writeIndex = currentIndex - buffer.length;
if( buffer[writeIndex%buffer.length] == 13 &&
buffer[(writeIndex+1)%buffer.length] == 10 )
{
//then ignore the newline before the endstream
}
else if( buffer[(writeIndex+1)%buffer.length] == 10 )
{
//Then first byte is data, second byte is newline
out.write( buffer[writeIndex%buffer.length] );
}
else
{
out.write( buffer[writeIndex%buffer.length] );
out.write( buffer[(writeIndex+1)%buffer.length] );
}*/
/**
* Old way of handling newlines before endstream
for( int i=0; i<additionalBytes; i++ )
{
writeIndex = currentIndex - buffer.length;
if( writeIndex >=0 &&
//buffer[writeIndex%buffer.length] != 10 &&
buffer[writeIndex%buffer.length] != 13 )
{
out.write( buffer[writeIndex%buffer.length] );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -