pdftotext.cc

来自「source code: Covert TXT to PDF」· CC 代码 · 共 327 行

CC
327
字号
//========================================================================//// pdftotext.cc//// Copyright 1997-2002 Glyph & Cog, LLC////========================================================================#include <aconf.h>#include <stdio.h>#include <stdlib.h>#include <stddef.h>#include <string.h>#include "parseargs.h"#include "GString.h"#include "gmem.h"#include "GlobalParams.h"#include "Object.h"#include "Stream.h"#include "Array.h"#include "Dict.h"#include "XRef.h"#include "Catalog.h"#include "Page.h"#include "PDFDoc.h"#include "TextOutputDev.h"#include "CharTypes.h"#include "UnicodeMap.h"#include "Error.h"#include "config.h"static void printInfoString(FILE *f, Dict *infoDict, char *key,			    char *text1, char *text2, UnicodeMap *uMap);static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);static int firstPage = 1;static int lastPage = 0;static GBool physLayout = gFalse;static GBool rawOrder = gFalse;static GBool htmlMeta = gFalse;static char textEncName[128] = "";static char textEOL[16] = "";static char ownerPassword[33] = "";static char userPassword[33] = "";static GBool quiet = gFalse;static char cfgFileName[256] = "";static GBool printVersion = gFalse;static GBool printHelp = gFalse;static ArgDesc argDesc[] = {  {"-f",      argInt,      &firstPage,     0,   "first page to convert"},  {"-l",      argInt,      &lastPage,      0,   "last page to convert"},  {"-layout", argFlag,     &physLayout,    0,   "maintain original physical layout"},  {"-raw",    argFlag,     &rawOrder,      0,   "keep strings in content stream order"},  {"-htmlmeta", argFlag,   &htmlMeta,      0,   "generate a simple HTML file, including the meta information"},  {"-enc",    argString,   textEncName,    sizeof(textEncName),   "output text encoding name"},  {"-eol",    argString,   textEOL,        sizeof(textEOL),   "output end-of-line convention (unix, dos, or mac)"},  {"-opw",    argString,   ownerPassword,  sizeof(ownerPassword),   "owner password (for encrypted files)"},  {"-upw",    argString,   userPassword,   sizeof(userPassword),   "user password (for encrypted files)"},  {"-q",      argFlag,     &quiet,         0,   "don't print any messages or errors"},  {"-cfg",        argString,      cfgFileName,    sizeof(cfgFileName),   "configuration file to use in place of .xpdfrc"},  {"-v",      argFlag,     &printVersion,  0,   "print copyright and version info"},  {"-h",      argFlag,     &printHelp,     0,   "print usage information"},  {"-help",   argFlag,     &printHelp,     0,   "print usage information"},  {"--help",  argFlag,     &printHelp,     0,   "print usage information"},  {"-?",      argFlag,     &printHelp,     0,   "print usage information"},  {NULL}};int main(int argc, char *argv[]) {  PDFDoc *doc;  GString *fileName;  GString *textFileName;  GString *ownerPW, *userPW;  TextOutputDev *textOut;  FILE *f;  UnicodeMap *uMap;  Object info;  GBool ok;  char *p;  int exitCode;  exitCode = 99;  // parse args  ok = parseArgs(argDesc, &argc, argv);  if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {    fprintf(stderr, "pdftotext version %s\n", xpdfVersion);    fprintf(stderr, "%s\n", xpdfCopyright);    if (!printVersion) {      printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);    }    goto err0;  }  fileName = new GString(argv[1]);  // read config file  globalParams = new GlobalParams(cfgFileName);  if (textEncName[0]) {    globalParams->setTextEncoding(textEncName);  }  if (textEOL[0]) {    if (!globalParams->setTextEOL(textEOL)) {      fprintf(stderr, "Bad '-eol' value on command line\n");    }  }  if (quiet) {    globalParams->setErrQuiet(quiet);  }  // get mapping to output encoding  if (!(uMap = globalParams->getTextEncoding())) {    error(-1, "Couldn't get text encoding");    delete fileName;    goto err1;  }  // open PDF file  if (ownerPassword[0]) {    ownerPW = new GString(ownerPassword);  } else {    ownerPW = NULL;  }  if (userPassword[0]) {    userPW = new GString(userPassword);  } else {    userPW = NULL;  }  doc = new PDFDoc(fileName, ownerPW, userPW);  if (userPW) {    delete userPW;  }  if (ownerPW) {    delete ownerPW;  }  if (!doc->isOk()) {    exitCode = 1;    goto err2;  }  // check for copy permission  if (!doc->okToCopy()) {    error(-1, "Copying of text from this document is not allowed.");    exitCode = 3;    goto err2;  }  // construct text file name  if (argc == 3) {    textFileName = new GString(argv[2]);  } else {    p = fileName->getCString() + fileName->getLength() - 4;    if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {      textFileName = new GString(fileName->getCString(),				 fileName->getLength() - 4);    } else {      textFileName = fileName->copy();    }    textFileName->append(htmlMeta ? ".html" : ".txt");  }  // get page range  if (firstPage < 1) {    firstPage = 1;  }  if (lastPage < 1 || lastPage > doc->getNumPages()) {    lastPage = doc->getNumPages();  }  // write HTML header  if (htmlMeta) {    if (!textFileName->cmp("-")) {      f = stdout;    } else {      if (!(f = fopen(textFileName->getCString(), "wb"))) {	error(-1, "Couldn't open text file '%s'", textFileName->getCString());	exitCode = 2;	goto err3;      }    }    fputs("<html>\n", f);    fputs("<head>\n", f);    doc->getDocInfo(&info);    if (info.isDict()) {      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",		      uMap);      printInfoString(f, info.getDict(), "Subject",		      "<meta name=\"Subject\" content=\"", "\">\n", uMap);      printInfoString(f, info.getDict(), "Keywords",		      "<meta name=\"Keywords\" content=\"", "\">\n", uMap);      printInfoString(f, info.getDict(), "Author",		      "<meta name=\"Author\" content=\"", "\">\n", uMap);      printInfoString(f, info.getDict(), "Creator",		      "<meta name=\"Creator\" content=\"", "\">\n", uMap);      printInfoString(f, info.getDict(), "Producer",		      "<meta name=\"Producer\" content=\"", "\">\n", uMap);      printInfoDate(f, info.getDict(), "CreationDate",		    "<meta name=\"CreationDate\" content=\"\">\n");      printInfoDate(f, info.getDict(), "LastModifiedDate",		    "<meta name=\"ModDate\" content=\"\">\n");    }    info.free();    fputs("</head>\n", f);    fputs("<body>\n", f);    fputs("<pre>\n", f);    if (f != stdout) {      fclose(f);    }  }  // write text file  textOut = new TextOutputDev(textFileName->getCString(),			      physLayout, rawOrder, htmlMeta);  if (textOut->isOk()) {    doc->displayPages(textOut, firstPage, lastPage, 72, 0, gFalse);  } else {    delete textOut;    exitCode = 2;    goto err3;  }  delete textOut;  // write end of HTML file  if (htmlMeta) {    if (!textFileName->cmp("-")) {      f = stdout;    } else {      if (!(f = fopen(textFileName->getCString(), "ab"))) {	error(-1, "Couldn't open text file '%s'", textFileName->getCString());	exitCode = 2;	goto err3;      }    }    fputs("</pre>\n", f);    fputs("</body>\n", f);    fputs("</html>\n", f);    if (f != stdout) {      fclose(f);    }  }  exitCode = 0;  // clean up err3:  delete textFileName; err2:  delete doc;  uMap->decRefCnt(); err1:  delete globalParams; err0:  // check for memory leaks  Object::memCheck(stderr);  gMemReport(stderr);  return exitCode;}static void printInfoString(FILE *f, Dict *infoDict, char *key,			    char *text1, char *text2, UnicodeMap *uMap) {  Object obj;  GString *s1;  GBool isUnicode;  Unicode u;  char buf[8];  int i, n;  if (infoDict->lookup(key, &obj)->isString()) {    fputs(text1, f);    s1 = obj.getString();    if ((s1->getChar(0) & 0xff) == 0xfe &&	(s1->getChar(1) & 0xff) == 0xff) {      isUnicode = gTrue;      i = 2;    } else {      isUnicode = gFalse;      i = 0;    }    while (i < obj.getString()->getLength()) {      if (isUnicode) {	u = ((s1->getChar(i) & 0xff) << 8) |	    (s1->getChar(i+1) & 0xff);	i += 2;      } else {	u = s1->getChar(i) & 0xff;	++i;      }      n = uMap->mapUnicode(u, buf, sizeof(buf));      fwrite(buf, 1, n, f);    }    fputs(text2, f);  }  obj.free();}static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {  Object obj;  char *s;  if (infoDict->lookup(key, &obj)->isString()) {    s = obj.getString()->getCString();    if (s[0] == 'D' && s[1] == ':') {      s += 2;    }    fprintf(f, fmt, s);  }  obj.free();}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?