📄 sgmldecl.c
字号:
/* sgmldecl.c -
   SGML declaration parsing.

   Written by James Clark (jjc@jclark.com).
*/

#include "sgmlincl.h"

/* Symbolic names for the error numbers that are generated only by
this module. */

#define E_SHUNCHAR 159
#define E_STANDARD 163
#define E_SIGNIFICANT 164
#define E_BADLIT 165
#define E_SCOPE 166
#define E_XNUM 167
#define E_BADVERSION 168
#define E_NMUNSUP 169
#define E_XNMLIT 170
#define E_CHARDESC 171
#define E_CHARDUP 172
#define E_CHARRANGE 173
#define E_7BIT 174
#define E_CHARMISSING 175
#define E_SHUNNED 176
#define E_NONSGML 177
#define E_CAPSET 178
#define E_CAPMISSING 179
#define E_SYNTAX 180
#define E_CHARNUM 181
#define E_SWITCHES 182
#define E_INSTANCE 183
#define E_ZEROFEATURE 184
#define E_YESNO 185
#define E_CAPACITY 186
#define E_NOTSUPPORTED 187
#define E_FORMAL 189
#define E_BADCLASS 190
#define E_MUSTBENON 191
#define E_BADBASECHAR 199
#define E_SYNREFUNUSED 200
#define E_SYNREFUNDESC 201
#define E_SYNREFUNKNOWN 202
#define E_SYNREFUNKNOWNSET 203
#define E_FUNDUP 204
#define E_BADFUN 205
#define E_FUNCHAR 206
#define E_GENDELIM 207
#define E_SRDELIM 208
#define E_BADKEY 209
#define E_BADQUANTITY 210
#define E_BADNAME 211
#define E_REFNAME 212
#define E_DUPNAME 213
#define E_QUANTITY 214
#define E_QTOOBIG 215
#define E_NMSTRTCNT 219
#define E_NMCHARCNT 220
#define E_NMDUP 221
#define E_NMBAD 222
#define E_NMMINUS 223
#define E_UNKNOWNSET 227
#define E_TOTALCAP 235

#define CANON_NMC '.'           /* Canonical name character. */
#define CANON_NMS 'A'           /* Canonical name start character. */
#define CANON_MIN ':'           /* Canonical minimum data character. */

#define SUCCESS 1
#define FAIL 0
#define SIZEOF(v) (sizeof(v)/sizeof(v[0]))
/* Compare a name token against a keyword.  The comparison starts at
tok+1, i.e. the first byte of the token buffer is skipped. */
#define matches(tok, str) (ustrcmp((tok)+1, (str)) == 0)

static UNCH standard[] = "ISO 8879:1986";

#define REFERENCE_SYNTAX "ISO 8879:1986//SYNTAX Reference//EN"
#define CORE_SYNTAX "ISO 8879:1986//SYNTAX Core//EN"

static UNCH (*newkey)[REFNAMELEN+1] = 0;

/* An entry in a name -> value lookup table.  The tables in this file
are terminated by an entry whose name is 0. */
struct pmap {
    char *name;
    UNIV value;
};

/* The reference capacity set. */
#define REFCAPSET \
{ 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, \
35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L }

long refcapset[NCAPACITY] = REFCAPSET;

/* A pmap of known capacity sets. */

static struct pmap capset_map[] = {
    { "ISO 8879:1986//CAPACITY Reference//EN", (UNIV)refcapset },
    { 0 },
};

/* Table of capacity names.  Must match *CAP in sgmldecl.h. */

char *captab[] = {
    "TOTALCAP",
    "ENTCAP",
    "ENTCHCAP",
    "ELEMCAP",
    "GRPCAP",
    "EXGRPCAP",
    "EXNMCAP",
    "ATTCAP",
    "ATTCHCAP",
    "AVGRPCAP",
    "NOTCAP",
    "NOTCHCAP",
    "IDCAP",
    "IDREFCAP",
    "MAPCAP",
    "LKSETCAP",
    "LKNMCAP",
};

/* The default SGML declaration. */

#define MAXNUMBER 99999999L

/* Reference quantity set */

#define REFATTCNT 40
#define REFATTSPLEN 960
#define REFBSEQLEN 960
#define REFDTAGLEN 16
#define REFDTEMPLEN 16
#define REFENTLVL 16
#define REFGRPCNT 32
#define REFGRPGTCNT 96
#define REFGRPLVL 16
#define REFNORMSEP 2
#define REFPILEN 240
#define REFTAGLEN 960
#define REFTAGLVL 24

/* Upper limits accepted for each quantity. */

#define ALLOC_MAX 65534
#define BIGINT 30000

#define MAXATTCNT ((ALLOC_MAX/sizeof(struct ad)) - 2)
#define MAXATTSPLEN BIGINT
#define MAXBSEQLEN BIGINT
#define MAXDTAGLEN 16
#define MAXDTEMPLEN 16
#define MAXENTLVL ((ALLOC_MAX/sizeof(struct source)) - 1)
#define MAXGRPCNT MAXGRPGTCNT
/* Must be between 96 and 253 */
#define MAXGRPGTCNT 253
#define MAXGRPLVL MAXGRPGTCNT
#define MAXLITLEN BIGINT
/* This guarantees that NAMELEN < LITLEN (ie there's always space for a name
in a buffer intended for a literal.) */
#define MAXNAMELEN (REFLITLEN - 1)
#define MAXNORMSEP 2
#define MAXPILEN BIGINT
#define MAXTAGLEN BIGINT
#define MAXTAGLVL ((ALLOC_MAX/sizeof(struct tag)) - 1)

/* Table of quantity names.  Must match Q* in sgmldecl.h. */

static char *quantity_names[] = {
    "ATTCNT",
    "ATTSPLEN",
    "BSEQLEN",
    "DTAGLEN",
    "DTEMPLEN",
    "ENTLVL",
    "GRPCNT",
    "GRPGTCNT",
    "GRPLVL",
    "LITLEN",
    "NAMELEN",
    "NORMSEP",
    "PILEN",
    "TAGLEN",
    "TAGLVL",
};

/* Maximum value for each quantity, in the same order as quantity_names. */
static int max_quantity[] = {
    MAXATTCNT,
    MAXATTSPLEN,
    MAXBSEQLEN,
    MAXDTAGLEN,
    MAXDTEMPLEN,
    MAXENTLVL,
    MAXGRPCNT,
    MAXGRPGTCNT,
    MAXGRPLVL,
    MAXLITLEN,
    MAXNAMELEN,
    MAXNORMSEP,
    MAXPILEN,
    MAXTAGLEN,
    MAXTAGLVL,
};

static char *quantity_changed;

/* Non-zero means the APPINFO parameter was not NONE. */
static int appinfosw = 0;

struct sgmldecl sd = {
    REFCAPSET,                  /* capacity */
#ifdef SUPPORT_SUBDOC
    MAXNUMBER,                  /* subdoc */
#else /* not SUPPORT_SUBDOC */
    0,                          /* subdoc */
#endif /* not SUPPORT_SUBDOC */
    1,                          /* formal */
    1,                          /* omittag */
    1,                          /* shorttag */
    1,                          /* shortref */
    { 1, 0 },                   /* general/entity name case translation */
    {                           /* reference quantity set */
        REFATTCNT,
        REFATTSPLEN,
        REFBSEQLEN,
        REFDTAGLEN,
        REFDTEMPLEN,
        REFENTLVL,
        REFGRPCNT,
        REFGRPGTCNT,
        REFGRPLVL,
        REFLITLEN,
        REFNAMELEN,
        REFNORMSEP,
        REFPILEN,
        REFTAGLEN,
        REFTAGLVL,
    },
};

/* Identity mapping: entry n has the value n, for n in 0..255. */
static int systemcharset[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};

/* This is a private use designating sequence that by convention
refers to the whole system character set whatever it is. */

#define SYSTEM_CHARSET_DESIGNATING_SEQUENCE "ESC 2/5 2/15 3/0"

/* Known base character sets, keyed by designating sequence. */
static struct pmap charset_map[] = {
    { "ESC 2/5 4/0", (UNIV)iso646charset },     /* ISO 646 IRV */
    { "ESC 2/8 4/2", (UNIV)iso646G0charset },   /* ISO Registration Number 6, ASCII */
    { "ESC 2/8 4/0", (UNIV)iso646G0charset },   /* ISO Registration Number 6, ASCII */
    { "ESC 2/13 4/1", (UNIV)iso8859_1charset }, /* Latin 1 */
    { "ESC 2/1 4/0", (UNIV)iso646C0charset },   /* ISO 646, C0 */
    { "ESC 2/2 4/3", (UNIV)iso6429C1charset },  /* ISO 6429, C1 */
    { SYSTEM_CHARSET_DESIGNATING_SEQUENCE, (UNIV)systemcharset },
                                                /* system character set */
    { 0 }
};

static int synrefcharset[256];  /* the syntax reference character set */

/* Bit flags stored per character in char_flags. */
#define CHAR_NONSGML 01
#define CHAR_SIGNIFICANT 02
#define CHAR_MAGIC 04
#define CHAR_SHUNNED 010

static UNCH char_flags[256];
static int done_nonsgml = 0;    /* set to 1 at the end of sdcharset */
static UNCH *nlextoke = 0;      /* new lextoke */
static UNCH *nlextran = 0;      /* new lextran */

#define MAX_SAVED_ERRS 4

static UNIV saved_errs[MAX_SAVED_ERRS];
static int nsaved_errs = 0;

/* Keywords recognized in the SGML declaration. */
static UNCH kcharset[] = "CHARSET";
static UNCH kbaseset[] = "BASESET";
static UNCH kdescset[] = "DESCSET";
static UNCH kunused[] = "UNUSED";
static UNCH kcapacity[] = "CAPACITY";
static UNCH kpublic[] = "PUBLIC";
static UNCH ksgmlref[] = "SGMLREF";
static UNCH kscope[] = "SCOPE";
static UNCH kdocument[] = "DOCUMENT";
static UNCH kinstance[] = "INSTANCE";
static UNCH ksyntax[] = "SYNTAX";
static UNCH kswitches[] = "SWITCHES";
static UNCH kfeatures[] = "FEATURES";
static UNCH kminimize[] = "MINIMIZE";
static UNCH kdatatag[] = "DATATAG";
static UNCH komittag[] = "OMITTAG";
static UNCH krank[] = "RANK";
static UNCH kshorttag[] = "SHORTTAG";
static UNCH klink[] = "LINK";
static UNCH ksimple[] = "SIMPLE";
static UNCH kimplicit[] = "IMPLICIT";
static UNCH kexplicit[] = "EXPLICIT";
static UNCH kother[] = "OTHER";
static UNCH kconcur[] = "CONCUR";
static UNCH ksubdoc[] = "SUBDOC";
static UNCH kformal[] = "FORMAL";
static UNCH kyes[] = "YES";
static UNCH kno[] = "NO";
static UNCH kappinfo[] = "APPINFO";
static UNCH knone[] = "NONE";
static UNCH kshunchar[] = "SHUNCHAR";
static UNCH kcontrols[] = "CONTROLS";
static UNCH kfunction[] = "FUNCTION";
static UNCH krs[] = "RS";
static UNCH kre[] = "RE";
static UNCH kspace[] = "SPACE";
static UNCH knaming[] = "NAMING";
static UNCH klcnmstrt[] = "LCNMSTRT";
static UNCH kucnmstrt[] = "UCNMSTRT";
static UNCH klcnmchar[] = "LCNMCHAR";
static UNCH kucnmchar[] = "UCNMCHAR";
static UNCH knamecase[] = "NAMECASE";
static UNCH kdelim[] = "DELIM";
static UNCH kgeneral[] = "GENERAL";
static UNCH kentity[] = "ENTITY";
static UNCH kshortref[] = "SHORTREF";
static UNCH knames[] = "NAMES";
static UNCH kquantity[] = "QUANTITY";

/* Errors in this module are reported through mderr. */
#define sderr mderr

static UNIV pmaplookup P((struct pmap *, char *));
static UNCH *ltous P((long));
static VOID sdfixstandard P((UNCH *, int));
static int sdparm P((UNCH *, struct parse *));
static int sdname P((UNCH *, UNCH *));
static int sdckname P((UNCH *, UNCH *));
static int sdversion P((UNCH *));
static int sdcharset P((UNCH *));
static int sdcsdesc P((UNCH *, int *));
static int sdpubcapacity P((UNCH *));
static int sdcapacity P((UNCH *));
static int sdscope P((UNCH *));
static VOID setlexical P((void));
static VOID noemptytag P((void));
static int sdpubsyntax P((UNCH *));
static int sdsyntax P((UNCH *));
static int sdxsyntax P((UNCH *));
static int sdtranscharnum P((UNCH *));
static int sdtranschar P((int));
static int sdshunchar P((UNCH *));
static int sdsynref P((UNCH *));
static int sdfunction P((UNCH *));
static int sdnaming P((UNCH *));
static int sddelim P((UNCH *));
static int sdnames P((UNCH *));
static int sdquantity P((UNCH *));
static int sdfeatures P((UNCH *));
static int sdappinfo P((UNCH *));
static VOID sdsaverr P((UNS, UNCH *, UNCH *));
static VOID bufsalloc P((void));
static VOID bufsrealloc P((void));

/* Parse the SGML declaration.

The section parsers are run in order (version, charset, capacity,
scope, syntax, features, appinfo) and parsing stops at the first one
that returns FAIL.  Errors recorded in saved_errs are then reported if
sd.formal is non-zero and freed otherwise.  When every section
succeeded, setlexical() is called and the closing delimiter is parsed
into a private buffer.

Return non-zero if there was some appinfo. */

int sgmldecl()
{
    int i;
    int errsw = 0;
    UNCH endbuf[REFNAMELEN+2];  /* buffer for parsing terminating > */
    static int (*section[]) P((UNCH *)) = {
        sdversion,
        sdcharset,
        sdcapacity,
        sdscope,
        sdsyntax,
        sdfeatures,
        sdappinfo,
    };

    /* These are needed if we use mderr. */
    parmno = 0;
    mdname = sgmlkey;
    subdcl = NULL;

    nsaved_errs = 0;
    for (i = 0; i < (int)SIZEOF(section); i++)
        if ((*section[i])(tbuf) == FAIL) {
            errsw = 1;
            break;
        }

    if (sd.formal) {
        /* print saved errors */
        for (i = 0; i < nsaved_errs; i++)
            svderr(saved_errs[i]);
    }
    else {
        /* free saved errors */
        for (i = 0; i < nsaved_errs; i++)
            msgsfree(saved_errs[i]);
    }

    if (!errsw)
        setlexical();
    /* Done whether or not an error occurred. */
    bufsrealloc();

    /* Parse the >.  Don't overwrite the appinfo. */
    if (!errsw)
        sdparm(endbuf, 0);

    /* We must exit if we hit end of document. */
    if (pcbsd.action == EOD_)
        exiterr(161, &pcbsd);
    if (!errsw && pcbsd.action != ESGD)
        sderr(126, (UNCH *)0, (UNCH *)0);
    return appinfosw;
}

/* Parse the literal (which should contain the version of the
standard) at the beginning of an SGML declaration.

Returns FAIL if the next parameter is not a literal.  A literal that
differs from `standard' is reported with E_BADVERSION, but parsing
still continues and SUCCESS is returned. */

static int sdversion(tbuf)
UNCH *tbuf;
{
    if (sdparm(tbuf, &pcblitv) != LIT1) {
        sderr(123, (UNCH *)0, (UNCH *)0);
        return FAIL;
    }
    sdfixstandard(tbuf, 0);
    if (ustrcmp(tbuf, standard) != 0)
        sderr(E_BADVERSION, tbuf, standard);
    return SUCCESS;
}

/* Parse the CHARSET section.  Use one token lookahead.

After the CHARSET keyword the next token is read here, so that
sdcsdesc finds it already in tbuf.  Every character that sdcsdesc left
undescribed or marked UNUSED gets CHAR_NONSGML set in char_flags.
Returns SUCCESS or FAIL. */

static int sdcharset(tbuf)
UNCH *tbuf;
{
    int i;
    int status[256];

    if (sdname(tbuf, kcharset) == FAIL)
        return FAIL;
    (void)sdparm(tbuf, 0);

    if (sdcsdesc(tbuf, status) == FAIL)
        return FAIL;

#if 0
    for (i = 128; i < 256; i++)
        if (status[i] != UNDESC)
            break;
    if (i >= 256) {
        /* Only a 7-bit character set was described.  Fill it out to 8-bits. */
        for (i = 128; i < 256; i++)
            status[i] = UNUSED;
#if 0
        sderr(E_7BIT, (UNCH *)0, (UNCH *)0);
#endif
    }
#endif
    /* Characters that are declared UNUSED in the document character
       set are assigned to non-SGML.  Characters that were not described
       at all are treated the same way. */
    for (i = 0; i < 256; i++) {
        if (status[i] == UNDESC) {
#if 0
            sderr(E_CHARMISSING, ltous((long)i), (UNCH *)0);
#endif
            char_flags[i] |= CHAR_NONSGML;
        }
        else if (status[i] == UNUSED)
            char_flags[i] |= CHAR_NONSGML;
    }
    done_nonsgml = 1;
    return SUCCESS;
}

/* Parse a character set description.  Uses one token lookahead: on
entry the caller must already have parsed the next token (its type is
read from pcbsd.action and its text from tbuf), and on successful
return the token that ended the description has been parsed but not
consumed.

tbuf is the token buffer.  status must point to 256 ints; each entry
is first set to UNDESC and then, for every described character, to
UNUSED, UNKNOWN, UNKNOWN_SET or the corresponding value from the base
set table.

A description consists of one or more BASESET/DESCSET pairs, and each
DESCSET must contain at least one "start count base" triple.  Returns
SUCCESS or FAIL. */

static int sdcsdesc(tbuf, status)
UNCH *tbuf;
int *status;
{
    int i;
    int nsets = 0;
    struct fpi fpi;

    for (i = 0; i < 256; i++)
        status[i] = UNDESC;

    for (;;) {
        int nchars;
        int *baseset = 0;

        /* The description ends at the first token that is not the
           BASESET keyword; that is an error only if no set was seen. */
        if (pcbsd.action != NAS1) {
            if (nsets == 0) {
                sderr(120, (UNCH *)0, (UNCH *)0);
                return FAIL;
            }
            break;
        }
        if (!matches(tbuf, kbaseset)) {
            if (nsets == 0) {
                sderr(118, tbuf+1, kbaseset);
                return FAIL;
            }
            break;
        }
        nsets++;

        MEMZERO((UNIV)&fpi, FPISZ);
        if (sdparm(tbuf, &pcblitv) != LIT1) {
            sderr(123, (UNCH *)0, (UNCH *)0);
            return FAIL;
        }
        fpi.fpipubis = tbuf;
        /* Give a warning if it is not a CHARSET fpi. */
        if (parsefpi(&fpi))
            sdsaverr(E_FORMAL, (UNCH *)0, (UNCH *)0);
        else if (fpi.fpic != FPICHARS)
            sdsaverr(E_BADCLASS, kcharset, (UNCH *)0);
        else {
            /* Terminate the identifier after the field that starts at
               offset fpil and has length fpill, then use that field as
               the key into charset_map. */
            fpi.fpipubis[fpi.fpil + fpi.fpill] = '\0';
            baseset = (int *)pmaplookup(charset_map,
                                        (char *)fpi.fpipubis + fpi.fpil);
            if (!baseset)
                sderr(E_UNKNOWNSET, fpi.fpipubis + fpi.fpil, (UNCH *)0);
        }
        if (sdname(tbuf, kdescset) == FAIL)
            return FAIL;

        nchars = 0;
        for (;;) {
            long start, count;
            long basenum;

            /* A non-number ends this DESCSET; the token is left in
               tbuf for the outer loop to examine. */
            if (sdparm(tbuf, 0) != NUM1)
                break;
            start = atol((char *)tbuf);
            if (sdparm(tbuf, 0) != NUM1) {
                sderr(E_XNUM, (UNCH *)0, (UNCH *)0);
                return FAIL;
            }
            count = atol((char *)tbuf);
            switch (sdparm(tbuf, &pcblitv)) {
            case NUM1:
                basenum = atol((char *)tbuf);
                break;
            case LIT1:
                basenum = UNKNOWN;
                break;
            case NAS1:
                if (matches(tbuf, kunused)) {
                    basenum = UNUSED;
                    break;
                }
                /* fall through */
            default:
                sderr(E_CHARDESC, ltous(start), (UNCH *)0);
                return FAIL;
            }

            /* Same test as start + count > 256 for non-negative
               values, but written so that it cannot overflow and so
               that negative values can never be used to index status
               (this does not rely on the tokenizer bounding the
               numbers). */
            if (start < 0 || count < 0 || start > 256
                || count > 256 - start)
                sderr(E_CHARRANGE, (UNCH *)0, (UNCH *)0);
            else {
                int c;
                int lim = (int)(start + count);

                for (c = (int)start; c < lim; c++) {
                    if (status[c] != UNDESC)
                        sderr(E_CHARDUP, ltous((long)c), (UNCH *)0);
                    else if (basenum == UNUSED || basenum == UNKNOWN)
                        status[c] = (int)basenum;
                    else if (baseset == 0)
                        status[c] = UNKNOWN_SET;
                    else {
                        /* Position in the base set; range-checked as a
                           long before it is used as an index. */
                        long n = basenum + (c - start);

                        if (n < 0 || n > 255)
                            sderr(E_CHARRANGE, (UNCH *)0, (UNCH *)0);
                        else {
                            if (baseset[n] == UNUSED)
                                sderr(E_BADBASECHAR, ltous(n), (UNCH *)0);
                            status[c] = baseset[n];
                        }
                    }
                }
            }
            nchars++;
        }
        if (nchars == 0) {
            sderr(E_XNUM, (UNCH *)0, (UNCH *)0);
            return FAIL;
        }
    }
    return SUCCESS;
}

/* Parse the CAPACITY section. Uses one token lookahead. */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -