📄 pdfparser.java
字号:
/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Created on Jul 14, 2003
 *
 */
package org.archive.crawler.extractor;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfObject;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PRIndirectReference;
import com.lowagie.text.pdf.PdfArray;

import java.io.*;
import java.util.*;

/**
 * Supports PDF parsing operations. For now this primarily means
 * extracting URIs, but the logic in extractURIs() could easily be
 * adopted/extended for a variety of PDF processing tasks.
 *
 * <p>Typical use: construct with a file name or a byte array, call
 * {@link #extractURIs()}, then read the result (also available later via
 * {@link #getURIs()}). An instance can be re-pointed at another document
 * with {@link #resetState(byte[])} or {@link #resetState(String)}.
 *
 * @author Parker Thompson
 */
// TODO make this more efficient: it currently has to read the whole file into
// memory before processing can begin, and appears to take much longer than it
// "should" to parse small, but admittedly complex, documents.
public class PDFParser {

    /** Values of every /URI or /URL dictionary entry found so far. */
    ArrayList<String> foundURIs;

    /**
     * Indirect references already visited, indexed by generation number;
     * each inner list holds the object numbers seen for that generation.
     */
    ArrayList<ArrayList<Integer>> encounteredReferences;

    /** iText reader opened over {@link #document}. */
    PdfReader documentReader;

    /** Raw bytes of the PDF being parsed. */
    byte[] document;

    /** Catalog dictionary obtained from {@link #documentReader}. */
    PdfDictionary catalog;

    /**
     * Create a parser over the PDF stored in the named file.
     *
     * @param doc path of the PDF file to read
     * @throws IOException if the file cannot be read or opened as a PDF
     */
    public PDFParser(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Create a parser over an in-memory PDF.
     *
     * @param doc the bytes of the PDF document
     * @throws IOException if the bytes cannot be opened as a PDF
     */
    public PDFParser(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /**
     * Reinitialize the object as though a new one were created: empties the
     * found-URI and seen-reference lists and drops the current document,
     * reader and catalog.
     */
    protected void resetState() {
        foundURIs = new ArrayList<String>();
        // Starts empty; haveSeen()/markAsSeen() grow it on demand.
        encounteredReferences = new ArrayList<ArrayList<Integer>>();
        documentReader = null;
        document = null;
        catalog = null;
    }

    /**
     * Reset the object and initialize it with a new byte array (the document).
     *
     * @param doc the bytes of the PDF document
     * @throws IOException if the bytes cannot be opened as a PDF
     */
    public void resetState(byte[] doc) throws IOException {
        resetState();
        document = doc;
        initialize();
    }

    /**
     * Reinitialize the object as though a new one were created, complete
     * with a valid pointer to a document that can be read.
     *
     * @param doc path of the PDF file to read
     * @throws IOException if the file cannot be read or opened as a PDF
     */
    public void resetState(String doc) throws IOException {
        resetState();
        getInFromFile(doc);
        initialize();
    }

    /**
     * Read a file named 'doc' and store its bytes for later processing.
     *
     * @param doc path of the file to read
     * @throws IOException if the file is missing, cannot be read completely,
     *         or is too large to hold in a single byte array
     */
    protected void getInFromFile(String doc) throws IOException {
        File documentOnDisk = new File(doc);
        long length = documentOnDisk.length();
        if (length > Integer.MAX_VALUE) {
            // The (int) cast below would otherwise overflow silently.
            throw new IOException("PDF too large to read into memory ("
                    + length + " bytes): " + doc);
        }
        byte[] contents = new byte[(int) length];
        DataInputStream inStream =
                new DataInputStream(new FileInputStream(documentOnDisk));
        try {
            // A single read(byte[]) may return fewer bytes than requested;
            // readFully keeps reading until the buffer is full.
            inStream.readFully(contents);
        } finally {
            inStream.close();
        }
        document = contents;
    }

    /**
     * Indicates, based on a PDFObject's generation/id pair, whether
     * the parser has already encountered this object (or a reference to it)
     * so we don't infinitely loop on circuits within the PDF.
     *
     * @param generation generation number of the indirect reference
     * @param id object number of the indirect reference
     * @return True if already seen.
     */
    protected boolean haveSeen(int generation, int id) {
        if (generation >= encounteredReferences.size()) {
            // No list for this generation yet: create it (and any gaps below
            // it) so markAsSeen() can store into it; clearly not seen.
            ensureGeneration(generation);
            return false;
        }
        return encounteredReferences.get(generation)
                .contains(Integer.valueOf(id));
    }

    /**
     * Note that an object (id/generation pair) has been seen by this parser
     * so that it can be handled differently when it is encountered again.
     *
     * @param generation generation number of the indirect reference
     * @param id object number of the indirect reference
     */
    protected void markAsSeen(int generation, int id) {
        // Do not rely on haveSeen() having been called first.
        ensureGeneration(generation);
        encounteredReferences.get(generation).add(Integer.valueOf(id));
    }

    /** Grow encounteredReferences until index 'generation' is valid. */
    private void ensureGeneration(int generation) {
        while (encounteredReferences.size() <= generation) {
            encounteredReferences.add(new ArrayList<Integer>());
        }
    }

    /**
     * Get a list of URIs retrieved from the Pdf during the
     * extractURIs operation.
     *
     * @return A list of URIs retrieved from the Pdf during the
     * extractURIs operation. This is the parser's own list, not a copy.
     */
    public ArrayList<String> getURIs() {
        return foundURIs;
    }

    /**
     * Initialize opens the document for reading. This is done implicitly
     * by the constructor. This should only need to be called directly
     * following a reset.
     *
     * @throws IOException if the document bytes cannot be opened as a PDF
     * @throws IllegalStateException if there is neither a document nor an
     *         already-open reader to take the catalog from
     */
    protected void initialize() throws IOException {
        if (document != null) {
            documentReader = new PdfReader(document);
        }
        if (documentReader == null) {
            throw new IllegalStateException(
                    "No PDF document has been supplied to parse");
        }
        catalog = documentReader.getCatalog();
    }

    /**
     * Extract URIs from all objects found in a Pdf document's catalog.
     * Returns an array list representing all URIs found in the document
     * catalog tree.
     *
     * @return URIs from all objects found in a Pdf document's catalog.
     */
    public ArrayList<String> extractURIs() {
        extractURIs(catalog);
        return getURIs();
    }

    /**
     * Walk a PdfObject recursively, looking for URIs and adding them to
     * foundURIs. Dictionaries, arrays and indirect references are descended
     * into; any other kind of object, and null, is ignored.
     *
     * @param entity the object to examine; may be null
     */
    protected void extractURIs(PdfObject entity) {
        if (entity == null) {
            // Nothing to walk, e.g. a reference that could not be resolved.
            return;
        }

        if (entity.isDictionary()) {
            // deal with dictionaries
            PdfDictionary dictionary = (PdfDictionary) entity;
            @SuppressWarnings("unchecked")
            Set<PdfName> allkeys = dictionary.getKeys();
            for (PdfName key : allkeys) {
                PdfObject value = dictionary.get(key);
                String keyName = key.toString();
                // see if the key is a UR[I,L]
                if ("/URI".equals(keyName) || "/URL".equals(keyName)) {
                    if (value != null) {
                        foundURIs.add(value.toString());
                    }
                } else {
                    this.extractURIs(value);
                }
            }
        } else if (entity.isArray()) {
            // deal with arrays
            PdfArray array = (PdfArray) entity;
            for (Object element : array.getArrayList()) {
                this.extractURIs((PdfObject) element);
            }
        } else if (entity instanceof PRIndirectReference) {
            // deal with indirect references
            PRIndirectReference indirect = (PRIndirectReference) entity;
            int generation = indirect.getGeneration();
            int number = indirect.getNumber();
            if (haveSeen(generation, number)) {
                // already followed this reference; stop to avoid cycles
                return;
            }
            markAsSeen(generation, number);

            // dereference the "pointer" and process the object
            PdfObject direct = PdfReader.getPdfObject(indirect);
            this.extractURIs(direct);
        }
    }

    /**
     * Command-line driver: prints every URI found in a PDF.
     *
     * @param argv optional; argv[0] is the path of the PDF to parse. When
     *        absent the original hard-coded sample path is used.
     */
    public static void main(String[] argv) {
        String path = "/home/parkert/files/pdfspec.pdf";
        if (argv != null && argv.length > 0) {
            path = argv[0];
        }
        try {
            PDFParser parser = new PDFParser(path);
            for (String uri : parser.extractURIs()) {
                System.out.println("got uri: " + uri);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -