📄 ptotdll.c
字号:
/* Copyright (C) 1995-1998, Digital Equipment Corporation. *//* All rights reserved. *//* See the file pstotext.txt for a full description. *//* Last modified on Fri Jan 09 21:19:00 AEST 2004 by rjl *//* modified on Fri Jan 09 08:21:00 AEST 2004 by rjl *//* modified on Wed Oct 28 08:42:15 PST 1998 by mcjones *//* modified on Sun Jul 28 00:00:00 UTC 1996 by rjl *//* Modifications by rjl * Fixed compiler warnings *//* This module is based on OCR_PS.m3, a module of the Virtual Paper project at the DEC Systems Research Center: http://www.research.digital.com/SRC/virtualpaper/ */#include <math.h>#include <string.h>#include <stdlib.h>#include "ptotdll.h"#ifndef NULL#define NULL 0#endif#define BOOLEAN int#define FALSE 0#define TRUE 1#define MIN(a,b) ((a)<=(b)?(a):(b))#define MAX(a,b) ((b)<=(a)?(a):(b))/* Character encoding. Each element of the QE directive produced by ocr.ps is either an index in the StandardGlyph array, or is "NonstandardGlyph" (indicating the corresponding entry in the font's encoding specifies some nonstandard glyph). */typedef unsigned GlyphIndex;#define NonstandardGlyph 9999#define UnknownChar '#' /* substitute for nonstandard glyph *//* The first 256 entries in StandardGlyphs correspond to ISOLatin1; the next 28 entries correspond to characters not in ISOLatin1, but defined in the standard /Times-Roman font. */#define LastISOLatin1 255#define FIRSTSpecialGlyphs (LastISOLatin1+1)#define LASTSpecialGlyphs (LastISOLatin1+28)static const char *SpecialGlyphs[] = { "''", /* quotedblright */ "S\237", /* Scaron */ "+", /* dagger */ "<", /* guilsinglleft */ "Z\237", /* Zcaron */ "#", /* daggerdbl */ "L/", /* Lslash */ "...", /* ellipsis */ ">", /* guilsinglright */ "oe", /* oe */ "fi", /* fi */ ".", /* bullet */ "o/oo", /* perthousand */ "''", /* quotedblbase */ "--", /* endash */ "---", /* emdash */ "^TM", /* trademark */ "f", /* florin */ "l/", /* lslash */ "s\237", /* scaron */ "Y\250", /* Ydieresis */ "fl", /* fl */ "/", /* fraction */ "``", /* quotedblleft */ "'", /* quotesinglbase */ "'", /* quotesingle */ "z\237", /* zcaron */ "OE" /* OE */ };/* The next 256 entries correspond to the self-named glyphs used in Type 3 fonts from dvips: "\000", ..., "\377": */#define FirstDvips (LASTSpecialGlyphs+1)#define LastDvips (FirstDvips+256-1)/* The next 512 entries correspond to glyph names used in Microsoft TrueType fonts: "G00", ..., "Gff" and "G00", ..., "GFF", which in both cases correspond to ISOLatin1 with some extensions. */#define FirstTT1 (LastDvips+1)#define LastTT1 (FirstTT1+256-1)#define FirstTT2 (LastTT1+1)#define LastTT2 (FirstTT2+256-1)#define FirstOldDvips (LastTT2+1)#define LastOldDvips (FirstOldDvips+128-1) /* note only 128 */#define FIRSTTTSpecialGlyphs (FirstTT1+130)#define LASTTTSpecialGlyphs (FirstTT1+159)static const char *TTSpecialGlyphs[] = { "'", /* quotesinglbase */ "f", /* florin */ "''", /* quotdblbase */ "...", /* ellipsis */ "+", /* dagger */ "#", /* daggerdbl */ "\223", /* circumflex */ "o/oo", /* perthousand */ "S\237", /* Scaron */ "<", /* guilsinglleft */ "OE", /* OE */ "#", /* <undefined> */ "#", /* <undefined> */ "#", /* <undefined> */ "#", /* <undefined> */ "`", /* ISOLatin1: quoteleft */ "'", /* ISOLatin1: quoteright */ "``", /* quotedblleft */ "''", /* quotedblright */ ".", /* bullet */ "--", /* endash */ "---", /* emdash */ "~", /* ISOLatin1: tilde */ "^TM", /* trademark */ "s\237", /* scaron */ ">", /* guilsinglright */ "oe", /* oe */ "#", /* <undefined> */ "#", /* <undefined> */ "Y\250" /* Ydieresis" */ };#define FIRSTDvipsGlyphs FirstDvips#define LASTDvipsGlyphs (FirstDvips+127)static const char *DvipsGlyphs[] = { /* 00x */ "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon", /* 01x */ "\\Phi", "\\Psi", "\\Omega", "ff", "fi", "fl", "ffi", "ffl", /* 02x */ "i", /* \imath */ "j", /* \jmath */ "`", "'", "\237", /* caron */ "\226", /* breve */ "\257", /* macron */ "\232", /* ring */ /* 03x */ "\270", /* cedilla */ "\337", /* germandbls */ "ae", "oe", "\370", /* oslash */ "AE", "OE", "\330", /* Oslash */ /* 04x */ "/" /* bar for Polish suppressed-L ??? */, "!", "''", "#", "$", "%", "&", "'", /* 05x */ "(", ")", "*", "+", ",", "\255" /* hyphen */, ".", "/", /* 06x */ "0", "1", "2", "3", "4", "5", "6", "7", /* 07x */ "8", "9", ":", ";", "!" /* exclamdown */, "=", "?" /* questiondown */, "?", /* 010x */ "@", "A", "B", "C", "D", "E", "F", "G", /* 011x */ "H", "I", "J", "K", "L", "M", "N", "O", /* 012x */ "P", "Q", "R", "S", "T", "U", "V", "W", /* 013x */ "X", "Y", "Z", "[", "``", "]", "\223" /* circumflex */, "\227" /* dotaccent */, /* 014x */ "`", "a", "b", "c", "d", "e", "f", "g", /* 015x */ "h", "i", "j", "k", "l", "m", "n", "o", /* 016x */ "p", "q", "r", "s", "t", "u", "v", "w", /* 017x */ "x", "y", "z", "--", /* en dash */ "---", /* em dash */ "\235", /* hungarumlaut */ "~", "\250" /* dieresis */ };#define FIRSTCorkSpecialGlyphs FirstDvips#define LASTCorkSpecialGlyphs (FirstDvips+0277)static const char *CorkSpecialGlyphs[] = { /* 000 - accents for lowercase letters */ "`", "'", "^", "~", "\230", /* umlaut/dieresis */ "\235", /* hungarumlaut */ "\232", /* ring */ "\237", /* hacek/caron */ "\226", /* breve */ "\257", /* macron */ "\227", /* dot above/dotaccent */ "\270", /* cedilla */ "\236", /* ogonek */ /* 015 - miscellaneous */ "'", /* single base quote/quotesinglbase */ "<", /* single opening guillemet/guilsinglleft */ ">", /* single closing guillemet/guilsinglright */ "``", /* english opening quotes/quotedblleft */ "''", /* english closing quotes/quotedblright */ ",,", /* base quotes/quotedblbase */ "<<", /* opening guillemets/guillemotleft */ ">>", /* closing guillemets/guillemotright */ "--", /* en dash/endash */ "---", /* em dash/emdash */ "", /* compound work mark (invisible)/ */ "o", /* perthousandzero (used in conjunction with %) */ "\220", /* dotless i/dotlessi */ "j", /* dotless j */ "ff", /* ligature ff */ "fi", /* ligature fi */ "fl", /* ligature fl */ "ffi", /* ligature ffi */ "ffl", /* ligature ffl */ "_", /* visible space */ /* 041 - ASCII */ "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?", "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "\\","]", "^", "_", "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|", "}", "~", "\255", /* hyphenchar (hanging) */ /* 200 - letters for eastern European languages from latin-2 */ "A\226", /* Abreve */ "A\236", /* Aogonek */ "C\264", /* Cacute */ "C\237", /* Chacek */ "D\237", /* Dhacek */ "E\237", /* Ehacek */ "E\236", /* Eogonek */ "G\226", /* Gbreve */ "L\264", /* Lacute */ "L\237", /* Lhacek */ "L/", /* Lslash/Lstroke */ "N\264", /* Nacute */ "N\237", /* Nhacek */ "\\NG", /* Eng */ "O\235", /* Ohungarumlaut */ "R\264", /* Racute */ "R\237", /* Rhacek */ "S\264", /* Sacute */ "S\237", /* Shacek */ "S\270", /* Scedilla */ "T\237", /* Thacek */ "T\270", /* Tcedilla */ "U\235", /* Uhungarumlaut */ "U\232", /* Uring */ "Y\250", /* Ydieresis */ "Z\264", /* Zacute */ "Z\237", /* Zhacek */ "Z\227", /* Zdot */ "IJ", /* IJ */ "I\227", /* Idot */ "\\dj", /* dbar */ "\247", /* section */ "a\226", /* abreve */ "a\236", /* aogonek */ "c\222", /* cacute */ "c\237", /* chacek */ "d\237", /* dhacek */ "e\237", /* ehacek */ "e\236", /* eogonek */ "g\226", /* gbreve */ "l\222", /* lacute */ "l\237", /* lhacek */ "l/", /* lslash */ "n\222", /* nacute */ "n\237", /* nhacek */ "\\ng", /* eng */ "o\235", /* ohungarumlaut */ "r\222", /* racute */ "r\237", /* rhacek */ "s\222", /* sacute */ "s\237", /* shacek */ "s\270", /* scedilla */ "t\237", /* thacek */ "t\270", /* tcedilla */ "u\235", /* uhungarumlaut */ "u\232", /* uring */ "y\230", /* ydieresis */ "z\222", /* zacute */ "z\237", /* zhacek */ "z\227", /* zdot */ "ij", /* ij */ "\241", /* exclamdown */ "\277", /* questiondown */ "\243" /* sterling */ /* 0300-0377 is same as ISO 8859/1 except: 0337 is Ess-zed and 0377 is ess-zed/germandbls */};/* There are gaps in the set of printable ISOLatin1 characters: *//*CONST ISOLatin1Gaps = SET OF [0..255] { 8_0..8_37, 8_177..8_217, 8_231, 8_234};*/typedef struct { double blx, bly, toprx, topry; /* font matrix in character coordinates */ struct {double x, y;} chr[256]; /* widths in character coordinates */} MetricsRec;typedef MetricsRec *Metrics;typedef Metrics MetricsTable[];typedef GlyphIndex EncodingVector[256];typedef EncodingVector *Encoding;typedef Encoding EncodingTable[];typedef struct { double x, y; /* (1000,0) in font's character coordinate system */ double xp, yp; /* (0,1000) in font's character coordinate system */ int e; /* index in "encoding" */ int m; /* index in "metrics" */ double bx, by, tx, ty; /* height of font bbox in reporting coordinates */} FontRec;typedef FontRec *Font;typedef Font FontTable[];/* Instance "T". */#define state_normal 0#define state_metrics 1#define state_encoding 2typedef struct { double itransform[6]; /* transform from device to default coordinates */ int metricsSize; MetricsTable *metrics; int encodingSize; EncodingTable *encoding; BOOLEAN dvipsIsCork; /* assume Cork rather than "OT1" for dvips output */ int fontSize; FontTable *font; /* Data for current word prefix: */ char buf[1000]; int lbuf; /* elements 0 through "lbuf-1" of "buf" are in use */ int f; /* font number */ double x0, y0, x1, y1; /* initial and final currentpoint */ BOOLEAN nonEmptyPage; long blx, bly, toprx, topry; /* bounding box of last word output */ char word[1000]; /* last word output */ int state; /* state-specific components: */ /* state_encoding: */ int encoding_e, encoding_n, encoding_i; /* state_metrics: */ int metrics_m, metrics_i;} T;static int ReadChar(char **instr);static void UnreadChar(char **instr);static int ReadInt(char **instr);static long ReadLong(char **instr);static int ParseInverseTransform(T *t, char *instr);static int ParseEncoding(T *t, char *instr);static int ParseEncodingMore(T *t, char *instr);static void ReadPair(double *x, double *y, char **instr);static int ParseFont(T *t, char *instr);static int ParseMetrics(T *t, char *instr);static int ParseMetricsMore(T *t, char *instr);static void Itransform(T *t, double *x1, double *y1, double x0, double y0);static void Output(T *t, const char **pre, const char **word, int *llx, int *lly, int *urx, int *ury);static BOOLEAN SameDirection(double x0, double y0, double x1, double y1);static int ParseString( T *t, char *instr, const char **pre, const char **word, const char **post, int *llx, int *lly, int *urx, int *ury);int DLLEXPORT pstotextInit(void **instance) { T *t; int i; t = (T *)malloc(sizeof(T)); if (t == NULL) return PSTOTEXT_INIT_MALLOC; t->state = state_normal; /* Initialize t->itransform to the identity transform. */ t->itransform[0] = 1.0; t->itransform[1] = 0.0; t->itransform[2] = 0.0; t->itransform[3] = 1.0; t->itransform[4] = 0.0; t->itransform[5] = 0.0; t->metricsSize = t->encodingSize = t->fontSize = 100; t->metrics = (MetricsTable *)malloc(t->metricsSize * sizeof(Metrics)); if (t->metrics == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0; i<t->metricsSize; i++)(*t->metrics)[i] = NULL; t->encoding = (EncodingTable *)malloc(t->encodingSize * sizeof(Encoding)); if (t->encoding == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0;i<t->encodingSize;i++)(*t->encoding)[i] = NULL; t->dvipsIsCork = FALSE; t->font = (FontTable *)malloc(t->fontSize * sizeof(Font)); if (t->font == NULL) { free(t); return PSTOTEXT_INIT_MALLOC; } for(i=0;i<t->fontSize;i++)(*t->font)[i] = NULL; t->lbuf = 0; t->nonEmptyPage = FALSE; t->blx = t->bly = t->toprx = t->topry = 0; *instance = t; return 0;}int DLLEXPORT pstotextSetCork(void *instance, int value) { T *t = (T *)instance; t->dvipsIsCork = value; return 0; }int DLLEXPORT pstotextExit(void *instance) { T *t = (T *)instance; free(t->metrics); free(t->encoding); free(t->font); free(t); return 0;}static int ReadChar(char **instr) { int c = **(unsigned char**)instr; (*instr)++; return c;}static void UnreadChar(char **instr) { (*instr)--;}static int ReadInt(char **instr) { int i = 0; int sign = 1; int c; while ((c = ReadChar(instr))==' ') /* skip */ ; if (c=='-') {sign = -1; c = ReadChar(instr); } while ('0' <= c && c <= '9') {i = i*10+(c-'0'); c = ReadChar(instr);} UnreadChar(instr); return i*sign;}static long ReadLong(char **instr) { long i = 0; int sign = 1; int c; while ((c = ReadChar(instr))==' ') /* skip */ ; if (c=='-') {sign = -1; c = ReadChar(instr); } while ('0' <= c && c <= '9') {i = i*10+(c-'0'); c = ReadChar(instr);} UnreadChar(instr); return i*sign;}static int ParseInverseTransform(T *t, char *instr) { int i; for (i = 0; i<6; i++) t->itransform[i] = ReadLong(&instr) / 100.0; return 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -