📄 pdfreader.cs
字号:
{
indexes[i] = (int)((PdfNumber)array[i]).Number;
}
return indexes;
}
}
}
/// <summary>
/// Represents an AcroForm object in a PDF document. See the PDF reference 8.6.1 Interactive Form Dictionary.
/// </summary>
[CLSCompliant(true)]
public class PdfForm: PdfField
{
    /// <summary>
    /// Name of the dictionary entry that the constructor sets to true.
    /// The field is never reassigned, hence read-only.
    /// </summary>
    private static readonly PdfName NAName = new PdfName("/NeedAppearances");

    /// <summary>
    /// Initializes a new instance of PdfForm with the specified object number, generation number,
    /// and field dictionary. The entry /NeedAppearances of the field dictionary is set to true.
    /// </summary>
    /// <param name="objNumber">The object number.</param>
    /// <param name="generationNumber">The generation number.</param>
    /// <param name="fieldDictionary">The field dictionary.</param>
    public PdfForm(int objNumber, int generationNumber, PdfDictionary fieldDictionary)
        : base(objNumber, generationNumber, fieldDictionary)
    {
        // set NeedAppearances key so the viewer application regenerates the appearance streams
        // for the form fields.
        FieldDictionary.SetElement(NAName, new PdfBool(true));
    }
}
/// <summary>
/// One entry of the PDF cross-reference table: object number, generation number,
/// byte offset and the in-use flag.
/// See the PDF Reference 3.4.3 Cross-Reference Table.
/// </summary>
[CLSCompliant(true)]
public class PdfCrossReferenceEntry
{
    private int entryObjectNumber;
    private int entryGenerationNumber;
    private int entryOffset;
    private bool entryActive;

    /// <summary>
    /// Creates a cross-reference entry from its four components.
    /// </summary>
    /// <param name="objNumber">The object number</param>
    /// <param name="generationNumber">The generation number</param>
    /// <param name="offset">The byte offset within the PDF file</param>
    /// <param name="active">true if the object is not free, false otherwise</param>
    public PdfCrossReferenceEntry(int objNumber, int generationNumber,
        int offset, bool active)
    {
        entryObjectNumber = objNumber;
        entryGenerationNumber = generationNumber;
        entryOffset = offset;
        entryActive = active;
    }

    /// <summary>
    /// Gets or sets the object number.
    /// </summary>
    public int ObjectNumber
    {
        get { return entryObjectNumber; }
        set { entryObjectNumber = value; }
    }

    /// <summary>
    /// Gets or sets the generation number.
    /// </summary>
    public int GenerationNumber
    {
        get { return entryGenerationNumber; }
        set { entryGenerationNumber = value; }
    }

    /// <summary>
    /// Gets or sets the byte offset of the object within the document.
    /// </summary>
    public int Offset
    {
        get { return entryOffset; }
        set { entryOffset = value; }
    }

    /// <summary>
    /// Gets or sets the in-use flag: true if the object is not free, false otherwise.
    /// </summary>
    public bool Active
    {
        get { return entryActive; }
        set { entryActive = value; }
    }
}
/// <summary>
/// Parses PDF files for interactive form fields and allows getting and setting the values of those fields.
/// </summary>
/// <remarks>
/// The PDF parser in PdfReader is very simple and suffices only for basic cases.
/// PdfReader is intended to easily programmatically fill out PDF forms.
/// It parses the PDF in one go and allows the user to make changes to the form field values.
/// It writes an updated version of the PDF to a Stream, which (hopefully) conforms to the PDF Reference.
/// If you have advanced parsing needs, you are probably better off with a package such as <see href="http://www.lowagie.com/iText/">iText</see>.
/// </remarks>
/// <example>
/// Read a PDF file, change one text field, write the updated file back out:
/// <code>
/// // read the file
/// PdfReader reader = new PdfReader(infile);
///
/// // change one text field
/// try
/// {
/// ((PdfTXField)reader.FieldsByName["Name"]).Text = "Doe";
/// }
/// catch
/// {
/// }
///
/// // write the file
/// FileStream fileStream = new FileStream(file, System.IO.FileMode.Create);
/// reader.WritePdf(fileStream);
/// fileStream.Close();
/// </code>
/// </example>
[CLSCompliant(true)]
public class PdfReader
{
// The complete PDF data as a string; the constructor stores one char per input byte.
private string pdf;
// All form fields found by Parse(); stays empty when no form is found.
private PdfField[] fields = new PdfField[0];
// Maps the (made unique) field name to its PdfField; filled by Parse().
private Hashtable fieldsByName = new Hashtable();
// The interactive form; Parse() only reads its fields when this is not null.
// NOTE(review): presumably assigned by ParseAcroForm() - confirm, the assignment is not visible here.
private PdfForm form;
// Regular expressions used for parsing. In the visible part of this file only
// linearizedRegex, trailRegex and refRegex are referenced.
// "startxref <number> %%EOF"; group 1 is the number.
private static readonly Regex xrefRegex = new Regex(@"startxref\s*(\d+)\s*%%EOF", RegexOptions.Singleline);
// "trailer << ... >>"; group 1 is the (shortest) text between the delimiters.
private static readonly Regex trailerRegex = new Regex(@"trailer\s*<<(.*?)>>", RegexOptions.Singleline);
// "/Root <number> <number> R"; groups 1 and 2 are the two numbers.
private static readonly Regex rootRegex = new Regex(@"/Root\s*(\d+)\s+(\d+)\s*R", RegexOptions.Singleline);
// "/Size <number>"; group 1 is the number.
private static readonly Regex sizeRegex = new Regex(@"/Size\s*(\d+)", RegexOptions.Singleline);
// "xref 0 <number> <number>"; group 1 is the last number.
private static readonly Regex nullRegex = new Regex(@"\sxref\s+0\s+\d+\s+(\d+)", RegexOptions.Singleline);
// One cross-reference entry, optionally preceded by a "<number> <number>" pair:
// group 2 = first number of the optional pair, group 4 = 10-digit field,
// group 5 = 5-digit field, group 6 = "n" or "f".
private static readonly Regex refRegex = new Regex(@"\s*((\d+)\s+(\d+))?\s*(\d{10})\s+(\d{5})\s+(n|f)", RegexOptions.Singleline);
// "<number> <number> obj ... endobj"; group 3 is the text between obj and endobj.
private static readonly Regex objectRegex = new Regex(@"(\d+)\s+(\d+)\s+obj(.*?)endobj", RegexOptions.Singleline);
// A dictionary at the start of the input containing "/FT /Btn", "/FT /Tx" or "/FT /Ch".
private static readonly Regex fieldRegex = new Regex(@"^\s*<<(.*?/FT\s+/(Btn|Tx|Ch).*>>)", RegexOptions.Singleline);
// A dictionary at the start of the input containing "/Fields [".
private static readonly Regex formRegex = new Regex(@"^\s*<<(.*?/Fields\s+\[.*>>)", RegexOptions.Singleline);
// "/AcroForm <number> <number>"; groups 1 and 2 are the two numbers.
private static readonly Regex acroFormRegex = new Regex(@"/AcroForm\s+(\d+)\s+(\d+)", RegexOptions.Singleline);
// "trailer << ... >>" optionally followed by "startxref <number>":
// group 1 = dictionary text up to and including the last ">>" (greedy), group 3 = the number.
private static readonly Regex trailRegex = new Regex(@"trailer\s+<<(.*>>)(\s+startxref\s+(\d+))?", RegexOptions.Singleline);
// "<number> <number> obj"; groups 1 and 2 are the two numbers.
private static readonly Regex objRegex = new Regex(@"(\d+)\s+(\d+)\s+obj", RegexOptions.Singleline);
// "/Linearized 1"; Parse() looks for it before the first "endobj".
private static readonly Regex linearizedRegex = new Regex(@"/Linearized\s+1", RegexOptions.Singleline);
private int previous = -1; // location of the previous cross-reference table
// Trailer dictionary kept by Parse(): the first one parsed (the last in the file), or,
// for linearized files, the one parsed last (each parsed trailer overwrites it).
private PdfDictionary previousTrailer;
// Object number of the /Root reference in previousTrailer.
private int rootObjectNumber;
// The 10-digit field of a free ("f") cross-reference entry for object number 0;
// set by ParseXRef().
private int nullOffset;
// Backing field of the Linearized property.
private bool linearized = false;
// Maps object number (int) to PdfCrossReferenceEntry; filled by ParseXRef() for in-use entries.
private Hashtable objects = new Hashtable();
// The same entries keyed and sorted by byte offset (int); filled by ParseXRef().
private SortedList offsets = new SortedList();
/// <summary>
/// Creates a PdfReader from the file with the specified name.
/// The file is opened, handed to the Stream constructor and closed again before returning.
/// </summary>
/// <param name="name">The file containing the PDF data.</param>
/// <returns>A PdfReader that has read and parsed the file.</returns>
public static PdfReader GetPdfReader(string name)
{
    // The file is only read, so request read access explicitly. The two-argument
    // FileStream constructor requests read/write access, which fails for read-only
    // files. FileShare.Read is the sharing mode that constructor used as well.
    using (FileStream stream = new FileStream(name, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        return new PdfReader(stream);
    }
}
/// <summary>
/// Initializes a new instance of PdfReader with the specified Stream.
/// The stream is read up to stream.Length bytes, every byte is turned into the char
/// with the same numeric value, and the resulting text is parsed.
/// </summary>
/// <param name="stream">The Stream containing the PDF data.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public PdfReader(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    DOMConfigurator.ConfigureAndWatch(new FileInfo("PdfReader.exe.log4net"));
    byte[] buffer = new byte[stream.Length];
    // Stream.Read is allowed to return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream reports its end.
    int bytesRead = 0;
    while (bytesRead < buffer.Length)
    {
        int chunk = stream.Read(buffer, bytesRead, buffer.Length - bytesRead);
        if (chunk <= 0)
        {
            break;
        }
        bytesRead += chunk;
    }
    // Only convert the bytes that were actually read; an unfilled tail of the buffer
    // would otherwise end up as NUL characters at the end of the document text.
    char[] chars = new char[bytesRead];
    for (int i = 0; i < bytesRead; i++)
    {
        chars[i] = (char)buffer[i];
    }
    pdf = new String(chars);
    Parse();
}
/// <summary>
/// Looks up the object a PDF reference points to and parses it.
/// </summary>
/// <param name="reference">The reference to the object.</param>
/// <returns>
/// The parsed object, or null when the object number is unknown or its
/// cross-reference entry is not active.
/// </returns>
public PdfObject GetObjectForReference(PdfReference reference)
{
    // unknown object number: nothing to resolve
    if (!objects.Contains(reference.ObjectNumber))
    {
        return null;
    }
    PdfCrossReferenceEntry xrefEntry = (PdfCrossReferenceEntry)objects[reference.ObjectNumber];
    // free objects are not parsed
    if (!xrefEntry.Active)
    {
        return null;
    }
    // the object spans from its own offset to the offset of the next object
    // (GetEndOfObject yields -1 for the last object)
    return ParseObject(xrefEntry.Offset, GetEndOfObject(reference.ObjectNumber));
}
/// <summary>
/// Gets or sets whether the file is considered linearized
/// (PDF Reference, Appendix F).
/// </summary>
public bool Linearized
{
    get { return linearized; }
    set { linearized = value; }
}
/// <summary>
/// Parses the document text: detects linearization, walks all trailers from the end of
/// the file towards its start (parsing the cross-reference table in front of each one),
/// determines the root object number, parses the AcroForm and collects the form fields
/// into <c>fields</c> and <c>fieldsByName</c>.
/// </summary>
/// <exception cref="FormatException">The data contains no trailer that could be parsed.</exception>
private void Parse()
{
    ArrayList fieldsList = new ArrayList();
    int startTrailer;
    int endPosition = pdf.Length - 1;
    int startxref;
    Match match;
    // hack to determine if PDF is linearized:
    // check for the occurrence of "/Linearized 1" before the first
    // occurrence of "endobj".
    // Without any "endobj" there is nothing to search (a length of -1 would make
    // Regex.Match throw).
    int firstEndObj = pdf.IndexOf("endobj");
    if (firstEndObj >= 0)
    {
        match = linearizedRegex.Match(pdf, 0, firstEndObj);
        if (match.Success)
        {
            Linearized = true;
        }
    }
    // successively parse all trailers starting from the end
    while (endPosition >= 0 && (startTrailer = pdf.LastIndexOf("trailer", endPosition)) >= 0)
    {
        // The next search must always start in front of the current "trailer" keyword.
        // Otherwise a keyword that does not match trailRegex, or an end position that
        // does not move in front of the trailer, would be found again forever.
        int nextEndPosition = startTrailer - 1;
        match = trailRegex.Match(pdf, startTrailer, endPosition - startTrailer + 1);
        if (match.Success)
        {
            string trailerString = match.Groups[1].Value;
            PdfDictionary trailerDictionary = new PdfDictionary(ref trailerString);
            startxref = -1;
            // in a hybrid-reference file (PDF 1.5), the trailer doesn't seem to
            // always include a startxref.
            if (match.Groups[2].Success)
            {
                startxref = Int32.Parse(match.Groups[3].Value, CultureInfo.InvariantCulture);
            }
            if (previous == -1)
            {
                previous = startxref;
            }
            // we don't believe the startxref field.
            // it can be bogus due to linearization.
            // we'll keep the "previous" value from above though, since
            // that's what taft@adobe.com says in a 1998 post on comp.text.pdf :-)
            startxref = pdf.LastIndexOf("xref", startTrailer);
            if (previousTrailer == null || Linearized)
            {
                previousTrailer = trailerDictionary;
            }
            // parse the xref table that is referenced by this trailer
            ParseXRef(startxref);
            int bodyStart;
            if (offsets.Count > 0)
            {
                // the offset of the first object marks the start of the body and is therefore
                // the end of the previous trailer
                bodyStart = ((PdfCrossReferenceEntry)offsets.GetByIndex(0)).Offset;
            }
            else
            {
                // if the xref table has no objects then the previous trailer must
                // be right in front of it
                bodyStart = startxref;
            }
            if (bodyStart < nextEndPosition)
            {
                nextEndPosition = bodyStart;
            }
        }
        endPosition = nextEndPosition;
    }
    if (previousTrailer == null)
    {
        throw new FormatException("The PDF data does not contain a trailer that could be parsed.");
    }
    rootObjectNumber = ((PdfReference)previousTrailer["/Root"]).ObjectNumber;
    ParseAcroForm();
    if (form != null)
    {
        // /Fields is either the array itself or a reference to it
        PdfObject fieldsObject = form.FieldDictionary["/Fields"];
        PdfArray fieldsArray = fieldsObject as PdfArray;
        if (fieldsArray == null)
        {
            PdfReference fieldsReference = fieldsObject as PdfReference;
            if (fieldsReference != null)
            {
                fieldsArray = (PdfArray)GetObjectForReference(fieldsReference);
            }
        }
        // a missing or unresolvable /Fields entry is treated as a form without fields
        if (fieldsArray != null)
        {
            foreach (PdfReference fieldReference in fieldsArray.Elements)
            {
                PdfField[] referencedFields = PdfField.GetPdfFields(fieldReference, this, null, "");
                foreach (PdfField field in referencedFields)
                {
                    Trace.Assert(field != null);
                    fieldsList.Add(field);
                    // make field name unique by appending "[0]", "[1]", ... to the
                    // original name until the result is unused
                    string fieldName = field.Name;
                    int i = 0;
                    while (fieldsByName.Contains(fieldName))
                    {
                        fieldName = string.Format(CultureInfo.InvariantCulture, "{0}[{1}]", field.Name, i);
                        i++;
                    }
                    fieldsByName.Add(fieldName, field);
                }
            }
        }
        this.fields = (PdfField[])fieldsList.ToArray(typeof(PdfField));
    }
}
/// <summary>
/// Parses the cross-reference entries between the position <paramref name="startxref"/>
/// and the next "trailer" keyword. In-use entries whose object number is not yet known
/// are added to <c>objects</c> and <c>offsets</c>; entries that are already known are left
/// untouched.
/// </summary>
/// <param name="startxref">
/// Index into the document text; the four characters starting there are skipped
/// (Parse passes the index of the "xref" keyword).
/// </param>
private void ParseXRef(int startxref)
{
    int tableStart = startxref + 4;
    int trailerStart = pdf.IndexOf("trailer", tableStart);
    string tableText = pdf.Substring(tableStart, trailerStart - tableStart + 1);
    // object number of the entry being processed; a subsection header resets it,
    // every entry advances it by one
    int currentNumber = 0;
    foreach (Match line in refRegex.Matches(tableText))
    {
        string subsectionStart = line.Groups[2].Value;
        if (subsectionStart.Length != 0)
        {
            currentNumber = Int32.Parse(subsectionStart, CultureInfo.InvariantCulture);
        }
        int entryOffset = Int32.Parse(line.Groups[4].Value, CultureInfo.InvariantCulture);
        int entryGeneration = Int32.Parse(line.Groups[5].Value, CultureInfo.InvariantCulture);
        bool inUse = line.Groups[6].Value.Equals("n");
        if (inUse)
        {
            if (!objects.Contains(currentNumber))
            {
                PdfCrossReferenceEntry xrefEntry =
                    new PdfCrossReferenceEntry(currentNumber, entryGeneration, entryOffset, inUse);
                offsets.Add(entryOffset, xrefEntry);
                objects.Add(currentNumber, xrefEntry);
            }
        }
        else if (currentNumber == 0)
        {
            // special case:
            // in order to build a new xref table we need the first free object number,
            // which is the first field of the free entry for object 0
            nullOffset = entryOffset;
        }
        currentNumber++;
    }
}
/// <summary>
/// Determines where the specified object ends.
/// The end is the start offset of the object with the next higher start offset.
/// If the document was updated, that offset may lie behind the xref table and trailer
/// that follow the specified object.
/// </summary>
/// <param name="objNumber">The object number to find the end offset for.</param>
/// <returns>The end offset of the specified object, or -1 if no object starts after it.</returns>
private int GetEndOfObject(int objNumber)
{
    PdfCrossReferenceEntry current = (PdfCrossReferenceEntry)objects[objNumber];
    // offsets is sorted by offset, so the successor sits at the next index
    int successorIndex = offsets.IndexOfKey(current.Offset) + 1;
    if (successorIndex >= offsets.Count)
    {
        return -1;
    }
    PdfCrossReferenceEntry successor = (PdfCrossReferenceEntry)offsets.GetByIndex(successorIndex);
    return successor.Offset;
}
/// <summary>
/// Parses the Interactive Form Dictionary referenced by the AcroForm entry in
/// the Document Catalog.
/// </summary>
private void ParseAcroForm()
{
PdfCrossReferenceEntry documentCatalog = (PdfCrossReferenceEntry)objects[rootObjectNumber];
int documentCatalogStart = documentCatalog.Offset;
int documentCatalogEnd = GetEndOfObject(documentCatalog.ObjectN
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -