📄 pdf_cmap.c.svn-base
字号:
/*
 * The CMap data structure here is constructed on the fly by
 * adding simple range-to-range mappings. Then the data structure
 * is optimized to contain both range-to-range and range-to-table
 * lookups.
 *
 * Any one-to-many mappings are inserted as one-to-table
 * lookups in the beginning, and are not affected by the optimization
 * stage.
 *
 * There is a special function to add a 256-length range-to-table mapping.
 * The ranges do not have to be added in order.
 *
 * This code can be a lot simpler if we don't care about wasting memory,
 * or can trust the parser to give us optimal mappings.
 */

#include "fitz.h"
#include "mupdf.h"

typedef struct pdf_range_s pdf_range;

enum { MAXCODESPACE = 10 };
enum { SINGLE, RANGE, TABLE, MULTI };

/*
 * One lookup entry covering the inclusive input code interval [low, high].
 */
struct pdf_range_s
{
	int low;
	int high;
	int flag;	/* what kind of lookup is this */
	int offset;	/* SINGLE/RANGE: output value for 'low'; TABLE/MULTI: index into the table */
};

/*
 * qsort comparator ordering ranges by ascending 'low'.
 * Uses explicit comparisons rather than a subtraction, because the
 * difference of two ints can overflow and yield the wrong sign.
 */
static int
cmprange(const void *va, const void *vb)
{
	const pdf_range *a = va;
	const pdf_range *b = vb;

	if (a->low < b->low)
		return -1;
	if (a->low > b->low)
		return 1;
	return 0;
}

struct pdf_cmap_s
{
	int refs;		/* reference count */
	int staticdata;		/* when non-zero, ranges and table are not freed on drop */
	char cmapname[32];
	char usecmapname[32];
	pdf_cmap *usecmap;	/* fallback cmap consulted when a lookup misses */

	int wmode;

	/* codespace ranges: n bytes per code, per-byte lower and upper bounds */
	int ncspace;
	struct {
		int n;
		unsigned char lo[4];
		unsigned char hi[4];
	} cspace[MAXCODESPACE];

	/* lookup entries: used length, allocated capacity, storage */
	int rlen, rcap;
	pdf_range *ranges;

	/* shared table for TABLE and MULTI entries: used length, capacity, storage */
	int tlen, tcap;
	int *table;
};

/*
 * Allocate, destroy and simple parameters.
*/fz_error *pdf_newcmap(pdf_cmap **cmapp){ pdf_cmap *cmap; cmap = *cmapp = fz_malloc(sizeof(pdf_cmap)); if (!cmap) return fz_throw("outofmem: cmap struct"); cmap->refs = 1; cmap->staticdata = 0; strcpy(cmap->cmapname, ""); strcpy(cmap->usecmapname, ""); cmap->usecmap = nil; cmap->wmode = 0; cmap->ncspace = 0; cmap->rlen = 0; cmap->rcap = 0; cmap->ranges = nil; cmap->tlen = 0; cmap->tcap = 0; cmap->table = nil; return fz_okay;}pdf_cmap *pdf_keepcmap(pdf_cmap *cmap){ cmap->refs ++; return cmap;}voidpdf_dropcmap(pdf_cmap *cmap){ if (--cmap->refs == 0) { if (cmap->usecmap) pdf_dropcmap(cmap->usecmap); if (!cmap->staticdata) { fz_free(cmap->ranges); fz_free(cmap->table); } fz_free(cmap); }}pdf_cmap *pdf_getusecmap(pdf_cmap *cmap){ return cmap->usecmap;}voidpdf_setusecmap(pdf_cmap *cmap, pdf_cmap *usecmap){ int i; if (cmap->usecmap) pdf_dropcmap(cmap->usecmap); cmap->usecmap = pdf_keepcmap(usecmap); if (cmap->ncspace == 0) { cmap->ncspace = usecmap->ncspace; for (i = 0; i < usecmap->ncspace; i++) cmap->cspace[i] = usecmap->cspace[i]; }}intpdf_getwmode(pdf_cmap *cmap){ return cmap->wmode;}voidpdf_setwmode(pdf_cmap *cmap, int wmode){ cmap->wmode = wmode;}voidpdf_debugcmap(pdf_cmap *cmap){ int i, k, n; printf("cmap $%p /%s {\n", (void *) cmap, cmap->cmapname); if (cmap->usecmapname[0]) printf(" usecmap /%s\n", cmap->usecmapname); if (cmap->usecmap) printf(" usecmap $%p\n", (void *) cmap->usecmap); printf(" wmode %d\n", cmap->wmode); printf(" codespaces {\n"); for (i = 0; i < cmap->ncspace; i++) { printf(" <"); for (k = 0; k < cmap->cspace[i].n; k++) printf("%02x", cmap->cspace[i].lo[k]); printf("> <"); for (k = 0; k < cmap->cspace[i].n; k++) printf("%02x", cmap->cspace[i].hi[k]); printf(">\n"); } printf(" }\n"); printf(" ranges (%d,%d) {\n", cmap->rlen, cmap->tlen); for (i = 0; i < cmap->rlen; i++) { pdf_range *r = &cmap->ranges[i]; printf(" <%04x> <%04x> ", r->low, r->high); if (r->flag == TABLE) { printf("[ "); for (k = 0; k < r->high - r->low + 1; k++) printf("%d ", 
cmap->table[r->offset + k]); printf("]\n"); } else if (r->flag == MULTI) { printf("< "); n = cmap->table[r->offset]; for (k = 0; k < n; k++) printf("%04x ", cmap->table[r->offset + 1 + k]); printf(">\n"); } else printf("%d\n", r->offset); } printf(" }\n}\n");}/* * Add a codespacerange section. * These ranges are used by pdf_decodecmap to decode * multi-byte encoded strings. */fz_error *pdf_addcodespace(pdf_cmap *cmap, unsigned lo, unsigned hi, int n){ int i; assert(!cmap->staticdata); if (cmap->ncspace + 1 == MAXCODESPACE) return fz_throw("assert: too many code space ranges"); cmap->cspace[cmap->ncspace].n = n; for (i = 0; i < n; i++) { int o = (n - i - 1) * 8; cmap->cspace[cmap->ncspace].lo[i] = (lo >> o) & 0xFF; cmap->cspace[cmap->ncspace].hi[i] = (hi >> o) & 0xFF; } cmap->ncspace ++; return fz_okay;}/* * Add an integer to the table. */static fz_error *addtable(pdf_cmap *cmap, int value){ assert(!cmap->staticdata); if (cmap->tlen + 1 > cmap->tcap) { int newcap = cmap->tcap == 0 ? 256 : cmap->tcap * 2; int *newtable = fz_realloc(cmap->table, newcap * sizeof(int)); if (!newtable) return fz_throw("outofmem: cmap table"); cmap->tcap = newcap; cmap->table = newtable; } cmap->table[cmap->tlen++] = value; return fz_okay;}/* * Add a range. */static fz_error *addrange(pdf_cmap *cmap, int low, int high, int flag, int offset){ assert(!cmap->staticdata); if (cmap->rlen + 1 > cmap->rcap) { pdf_range *newranges; int newcap = cmap->rcap == 0 ? 256 : cmap->rcap * 2; newranges = fz_realloc(cmap->ranges, newcap * sizeof(pdf_range)); if (!newranges) return fz_throw("outofmem: cmap ranges"); cmap->rcap = newcap; cmap->ranges = newranges; } cmap->ranges[cmap->rlen].low = low; cmap->ranges[cmap->rlen].high = high; cmap->ranges[cmap->rlen].flag = flag; cmap->ranges[cmap->rlen].offset = offset; cmap->rlen ++; return fz_okay;}/* * Add a range-to-table mapping. 
*/fz_error *pdf_maprangetotable(pdf_cmap *cmap, int low, int *table, int len){ fz_error *error; int offset; int high; int i; high = low + len; offset = cmap->tlen; for (i = 0; i < len; i++) { error = addtable(cmap, table[i]); if (error) return fz_rethrow(error, "cannot add range-to-table index"); } error = addrange(cmap, low, high, TABLE, offset); if (error) return fz_rethrow(error, "cannot add range-to-table range"); return fz_okay;}/* * Add a range of contiguous one-to-one mappings (ie 1..5 maps to 21..25) */fz_error *pdf_maprangetorange(pdf_cmap *cmap, int low, int high, int offset){ fz_error *error; error = addrange(cmap, low, high, high - low == 0 ? SINGLE : RANGE, offset); if (error) return fz_rethrow(error, "cannot add range-to-range mapping"); return fz_okay;}/* * Add a single one-to-many mapping. */fz_error *pdf_maponetomany(pdf_cmap *cmap, int low, int *values, int len){ fz_error *error; int offset; int i; if (len == 1) { error = addrange(cmap, low, low, SINGLE, values[0]); if (error) return fz_rethrow(error, "cannot add one-to-one mapping"); return fz_okay; } offset = cmap->tlen; error = addtable(cmap, len); if (error) return fz_rethrow(error, "cannot add one-to-many table length"); for (i = 0; i < len; i++) { error = addtable(cmap, values[i]); if (error) return fz_rethrow(error, "cannot add one-to-many table index"); } error = addrange(cmap, low, low, MULTI, offset); if (error) return fz_rethrow(error, "cannot add one-to-many mapping"); return fz_okay;}/* * Sort the input ranges. * Merge contiguous input ranges to range-to-range if the output is contiguous. * Merge contiguous input ranges to range-to-table if the output is random. 
*/fz_error *pdf_sortcmap(pdf_cmap *cmap){ fz_error *error; pdf_range *newranges; int *newtable; pdf_range *a; /* last written range on output */ pdf_range *b; /* current range examined on input */ assert(!cmap->staticdata); if (cmap->rlen == 0) return fz_okay; qsort(cmap->ranges, cmap->rlen, sizeof(pdf_range), cmprange); a = cmap->ranges; b = cmap->ranges + 1; while (b < cmap->ranges + cmap->rlen) { /* ignore one-to-many mappings */ if (b->flag == MULTI) { *(++a) = *b; } /* input contiguous */ else if (a->high + 1 == b->low) { /* output contiguous */ if (a->high - a->low + a->offset + 1 == b->offset) { /* SR -> R and SS -> R and RR -> R and RS -> R */ if (a->flag == SINGLE || a->flag == RANGE) { a->flag = RANGE; a->high = b->high; } /* LS -> L */ else if (a->flag == TABLE && b->flag == SINGLE) { a->high = b->high; error = addtable(cmap, b->offset); if (error) return fz_rethrow(error, "cannot convert LS -> L"); } /* LR -> LR */ else if (a->flag == TABLE && b->flag == RANGE) { *(++a) = *b; } /* XX -> XX */ else { *(++a) = *b; } } /* output separated */ else { /* SS -> L */ if (a->flag == SINGLE && b->flag == SINGLE) { a->flag = TABLE; a->high = b->high; error = addtable(cmap, a->offset); if (error) return fz_rethrow(error, "cannot convert SS -> L"); error = addtable(cmap, b->offset); if (error) return fz_rethrow(error, "cannot convert SS -> L"); a->offset = cmap->tlen - 2; } /* LS -> L */ else if (a->flag == TABLE && b->flag == SINGLE) { a->high = b->high; error = addtable(cmap, b->offset); if (error) return fz_rethrow(error, "cannot convert LS -> L"); } /* XX -> XX */ else { *(++a) = *b; } } } /* input separated: XX -> XX */ else { *(++a) = *b; } b ++; } cmap->rlen = a - cmap->ranges + 1; newranges = fz_realloc(cmap->ranges, cmap->rlen * sizeof(pdf_range)); if (!newranges) return fz_throw("outofmem: cmap ranges"); cmap->rcap = cmap->rlen; cmap->ranges = newranges; if (cmap->tlen) { newtable = fz_realloc(cmap->table, cmap->tlen * sizeof(int)); if (!newtable) return 
fz_throw("outofmem: cmap table"); cmap->tcap = cmap->tlen; cmap->table = newtable; } return fz_okay;}/* * Lookup the mapping of a codepoint. */intpdf_lookupcmap(pdf_cmap *cmap, int cpt){ int l = 0; int r = cmap->rlen - 1; int m; while (l <= r) { m = (l + r) >> 1; if (cpt < cmap->ranges[m].low) r = m - 1; else if (cpt > cmap->ranges[m].high) l = m + 1; else { int i = cpt - cmap->ranges[m].low + cmap->ranges[m].offset; if (cmap->ranges[m].flag == TABLE) return cmap->table[i]; if (cmap->ranges[m].flag == MULTI) return -1; return i; } } if (cmap->usecmap) return pdf_lookupcmap(cmap->usecmap, cpt); return -1;}/* * Use the codespace ranges to extract a codepoint from a * multi-byte encoded string. */unsigned char *pdf_decodecmap(pdf_cmap *cmap, unsigned char *buf, int *cpt){ int i, k; for (k = 0; k < cmap->ncspace; k++) { unsigned char *lo = cmap->cspace[k].lo; unsigned char *hi = cmap->cspace[k].hi; int n = cmap->cspace[k].n; int c = 0; for (i = 0; i < n; i++) { if (lo[i] <= buf[i] && buf[i] <= hi[i]) c = (c << 8) | buf[i]; else break; } if (i == n) { *cpt = c; return buf + n; } } *cpt = 0; return buf + 1;}/* * CMap parser */enum{ TUSECMAP = PDF_NTOKENS, TBEGINCODESPACERANGE, TENDCODESPACERANGE, TBEGINBFCHAR, TENDBFCHAR, TBEGINBFRANGE, TENDBFRANGE, TBEGINCIDCHAR, TENDCIDCHAR, TBEGINCIDRANGE, TENDCIDRANGE};static pdf_token_e tokenfromkeyword(char *key){ if (!strcmp(key, "usecmap")) return TUSECMAP; if (!strcmp(key, "begincodespacerange")) return TBEGINCODESPACERANGE; if (!strcmp(key, "endcodespacerange")) return TENDCODESPACERANGE; if (!strcmp(key, "beginbfchar")) return TBEGINBFCHAR; if (!strcmp(key, "endbfchar")) return TENDBFCHAR; if (!strcmp(key, "beginbfrange")) return TBEGINBFRANGE; if (!strcmp(key, "endbfrange")) return TENDBFRANGE; if (!strcmp(key, "begincidchar")) return TBEGINCIDCHAR; if (!strcmp(key, "endcidchar")) return TENDCIDCHAR; if (!strcmp(key, "begincidrange")) return TBEGINCIDRANGE; if (!strcmp(key, "endcidrange")) return TENDCIDRANGE; return 
PDF_TKEYWORD;}static int codefromstring(char *buf, int len){ int a = 0; while (len--) a = (a << 8) | *buf++; return a;}static fz_error *lexcmap(pdf_token_e *tok, fz_stream *file, char *buf, int n, int *sl){ fz_error *error; error = pdf_lex(tok, file, buf, n, sl); if (error) return fz_rethrow(error, "cannot parse cmap token"); if (*tok == PDF_TKEYWORD) *tok = tokenfromkeyword(buf); return fz_okay;}static fz_error *parsecmapname(pdf_cmap *cmap, fz_stream *file){ fz_error *error; char buf[256]; pdf_token_e tok; int len; error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok == PDF_TNAME) { strlcpy(cmap->cmapname, buf, sizeof(cmap->cmapname)); return fz_okay; } return fz_throw("expected name");}static fz_error *parsewmode(pdf_cmap *cmap, fz_stream *file){ fz_error *error; char buf[256]; pdf_token_e tok; int len; error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok == PDF_TINT) { pdf_setwmode(cmap, atoi(buf)); return fz_okay; } return fz_throw("expected integer");}static fz_error *parsecodespacerange(pdf_cmap *cmap, fz_stream *file){ fz_error *error; char buf[256]; pdf_token_e tok; int len; int lo, hi; while (1) { error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok == TENDCODESPACERANGE) return fz_okay; else if (tok == PDF_TSTRING) { lo = codefromstring(buf, len); error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok == PDF_TSTRING) { hi = codefromstring(buf, len); error = pdf_addcodespace(cmap, lo, hi, len); if (error) return fz_rethrow(error, "cannot add code space"); } else break; } else break; } return fz_throw("expected string or endcodespacerange");}static fz_error *parsecidrange(pdf_cmap *cmap, fz_stream *file){ fz_error *error; char buf[256]; pdf_token_e tok; int len; int lo, hi, dst; while (1) { 
error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok == TENDCIDRANGE) return fz_okay; else if (tok != PDF_TSTRING) return fz_throw("expected string or endcidrange"); lo = codefromstring(buf, len); error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok != PDF_TSTRING) return fz_throw("expected string"); hi = codefromstring(buf, len); error = lexcmap(&tok, file, buf, sizeof buf, &len); if (error) return fz_rethrow(error, "syntaxerror in cmap"); if (tok != PDF_TINT)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -