📄 pdftextextractor.java
字号:
package com.rsc.rs.util.xml;

import com.etymon.pj.Pdf;
import com.etymon.pj.exception.InvalidPdfObjectException;
import com.etymon.pj.exception.PjException;
import com.etymon.pj.object.PjArray;
import com.etymon.pj.object.PjObject;
import com.etymon.pj.object.PjPage;
import com.etymon.pj.object.PjStream;
import org.apache.log4j.Category;

import java.io.File;
import java.io.IOException;
import java.util.Vector;

/**
 * <p>
 * Attempts to extract text from a PDF file.
 * </p>
 * <p>
 * The extraction is deliberately naive: each content stream of a page is
 * decompressed and every run of characters found between a <code>(</code> and
 * the next <code>)</code> is concatenated. Escape sequences and nested
 * parentheses inside PDF string literals are <em>not</em> interpreted.
 * </p>
 * <p>
 * <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00280.html">
 * Known limitations</a>
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Revision: 1.1 $
 */
public class PdfTextExtractor
{
    /** File parsed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "/usr/local/test.pdf";

    private static final Category cat = Category.getInstance(PdfTextExtractor.class);

    /**
     * Prints the extracted text of every page of a PDF file to standard output.
     * Pages whose content could not be extracted are skipped (the failure is
     * logged) instead of being printed as the literal string "null".
     *
     * @param args optional; <code>args[0]</code> is the path of the PDF file to
     *             read. When absent, {@value #DEFAULT_PDF_PATH} is used.
     */
    public static void main(String[] args)
    {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        File f = new File(path);
        try
        {
            Pdf pdf = new Pdf(f.toString());
            int pagecount = pdf.getPageCount();
            cat.debug(f.toString() + " has " + pagecount + " pages.");
            // The loop numbers pages from 1 up to and including the page count.
            for (int i = 1; i <= pagecount; i++)
            {
                String content = getContent(pdf, i);
                if (content != null)
                {
                    System.out.println(content);
                }
                else
                {
                    cat.warn("No content extracted for page " + i + " of " + f.toString());
                }
            }
        }
        catch (IOException ioe)
        {
            cat.error("IOException parsing PDF file:" + f.toString(), ioe);
        }
        catch (PjException pje)
        {
            cat.error("PjException parsing PDF file:" + f.toString(), pje);
        }
    }

    /**
     * Extracts the text of a single page.
     * <p>
     * The page's contents entry is resolved; if it is a {@link PjArray}, every
     * element is resolved to a {@link PjStream} and the text of all streams is
     * concatenated in order. Otherwise the resolved object itself is treated as
     * the single content stream.
     * </p>
     *
     * @param pdf    the opened document
     * @param pageNo number of the page to read, as accepted by
     *               <code>Pdf.getPage(int)</code>
     * @return the extracted text, or <code>null</code> if any exception occurred
     *         while reading the page (the exception is logged, not rethrown)
     */
    private static String getContent(Pdf pdf, int pageNo)
    {
        String content = null;
        try
        {
            PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo));
            PjObject pobj = (PjObject) pdf.resolve(page.getContents());
            if (pobj instanceof PjArray)
            {
                // PjArray exposes its elements as a raw (pre-generics) Vector.
                Vector vArray = ((PjArray) pobj).getVector();
                StringBuffer strbf = new StringBuffer();
                int size = vArray.size();
                for (int j = 0; j < size; j++)
                {
                    PjStream stream = (PjStream) pdf.resolve((PjObject) vArray.get(j));
                    strbf.append(getStringFromPjStream(stream));
                }
                content = strbf.toString();
            }
            else
            {
                content = getStringFromPjStream((PjStream) pobj);
            }
        }
        catch (InvalidPdfObjectException pdfe)
        {
            cat.error("Invalid PDF Object:" + pdfe, pdfe);
        }
        catch (Exception e)
        {
            // Boundary catch: a malformed page (unexpected object type, missing
            // contents, ...) must not abort extraction of the remaining pages.
            cat.error("Exception in getContent() " + e, e);
        }
        return content;
    }

    /**
     * Decompresses a content stream with {@link PjStream#flateDecompress()} and
     * returns the concatenation of all parenthesised runs found in its
     * <code>toString()</code> form.
     *
     * @param stream the content stream to read
     * @return the extracted text; empty if decompression failed (the failure is
     *         logged) or the stream contains no parenthesised run
     */
    private static String getStringFromPjStream(PjStream stream)
    {
        try
        {
            stream = stream.flateDecompress();
            return extractParenthesizedText(stream.toString());
        }
        catch (InvalidPdfObjectException pdfe)
        {
            cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe);
            return "";
        }
    }

    /**
     * Concatenates the text between each <code>(</code> and the first
     * <code>)</code> that follows it, scanning left to right.
     * <p>
     * Scanning stops as soon as either delimiter can no longer be found. The
     * previous implementation kept looping in that situation: a failed search
     * for <code>(</code> yields -1, and <code>String.indexOf(int, int)</code>
     * treats a negative start index as 0, so the scan restarted from the
     * beginning and never terminated on input such as <code>"(a) b)"</code>.
     * </p>
     *
     * @param raw the text to scan; backslash escapes and nested parentheses are
     *            not interpreted
     * @return the concatenated runs, never <code>null</code>
     */
    private static String extractParenthesizedText(String raw)
    {
        StringBuffer text = new StringBuffer();
        int cursor = 0;
        while (true)
        {
            int open = raw.indexOf('(', cursor);
            if (open == -1)
            {
                break; // no further opening delimiter
            }
            int close = raw.indexOf(')', open + 1);
            if (close == -1)
            {
                break; // unterminated run: ignore the dangling remainder
            }
            text.append(raw.substring(open + 1, close));
            cursor = close + 1;
        }
        return text.toString();
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -