⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgmldecl.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 3 页
字号:
/* sgmldecl.c -
   SGML declaration parsing.

   Written by James Clark (jjc@jclark.com).
*/

#include "sgmlincl.h"

/* Symbolic names for the error numbers that are generated only by
this module. */

#define E_SHUNCHAR 159
#define E_STANDARD 163
#define E_SIGNIFICANT 164
#define E_BADLIT 165
#define E_SCOPE 166
#define E_XNUM 167
#define E_BADVERSION 168
#define E_NMUNSUP 169
#define E_XNMLIT 170
#define E_CHARDESC 171
#define E_CHARDUP 172
#define E_CHARRANGE 173
#define E_7BIT 174
#define E_CHARMISSING 175
#define E_SHUNNED 176
#define E_NONSGML 177
#define E_CAPSET 178
#define E_CAPMISSING 179
#define E_SYNTAX 180
#define E_CHARNUM 181
#define E_SWITCHES 182
#define E_INSTANCE 183
#define E_ZEROFEATURE 184
#define E_YESNO 185
#define E_CAPACITY 186
#define E_NOTSUPPORTED 187
#define E_FORMAL 189
#define E_BADCLASS 190
#define E_MUSTBENON 191
#define E_BADBASECHAR 199
#define E_SYNREFUNUSED 200
#define E_SYNREFUNDESC 201
#define E_SYNREFUNKNOWN 202
#define E_SYNREFUNKNOWNSET 203
#define E_FUNDUP 204
#define E_BADFUN 205
#define E_FUNCHAR 206
#define E_GENDELIM 207
#define E_SRDELIM 208
#define E_BADKEY 209
#define E_BADQUANTITY 210
#define E_BADNAME 211
#define E_REFNAME 212
#define E_DUPNAME 213
#define E_QUANTITY 214
#define E_QTOOBIG 215
#define E_NMSTRTCNT 219
#define E_NMCHARCNT 220
#define E_NMDUP 221
#define E_NMBAD 222
#define E_NMMINUS 223
#define E_UNKNOWNSET 227
#define E_TOTALCAP 235

#define CANON_NMC '.'           /* Canonical name character. */
#define CANON_NMS 'A'           /* Canonical name start character. */
#define CANON_MIN ':'           /* Canonical minimum data character. */

/* Status values returned by the parsing functions in this file. */
#define SUCCESS 1
#define FAIL 0

/* Number of elements in an array.  Only meaningful for true arrays,
not for pointers. */
#define SIZEOF(v) (sizeof(v)/sizeof((v)[0]))

/* Non-zero if the text of token TOK equals STR.  The comparison starts
at the second byte of TOK; the first byte is skipped (presumably a
length or type prefix -- confirm against the token buffer layout). */
#define matches(tok, str) (ustrcmp((tok)+1, (str)) == 0)

/* The literal that sdversion() expects at the start of the declaration. */
static UNCH standard[] = "ISO 8879:1986";

#define REFERENCE_SYNTAX "ISO 8879:1986//SYNTAX Reference//EN"
#define CORE_SYNTAX "ISO 8879:1986//SYNTAX Core//EN"

/* Pointer to an array of name buffers, each REFNAMELEN+1 bytes long;
initially null. */
static UNCH (*newkey)[REFNAMELEN+1] = 0;

/* One entry of a name -> value lookup table.  The tables in this file
end with an entry whose name is 0. */
struct pmap {
     char *name;
     UNIV value;
};

/* The reference capacity set.
*/#define REFCAPSET \{ 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, \35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L, 35000L }long refcapset[NCAPACITY] = REFCAPSET;/* A pmap of known capacity sets. */static struct pmap capset_map[] = {     { "ISO 8879:1986//CAPACITY Reference//EN", (UNIV)refcapset },     { 0 },};/* Table of capacity names.  Must match *CAP in sgmldecl.h. */char *captab[] = {     "TOTALCAP",     "ENTCAP",     "ENTCHCAP",     "ELEMCAP",     "GRPCAP",     "EXGRPCAP",     "EXNMCAP",     "ATTCAP",     "ATTCHCAP",     "AVGRPCAP",     "NOTCAP",     "NOTCHCAP",     "IDCAP",     "IDREFCAP",     "MAPCAP",     "LKSETCAP",     "LKNMCAP",};/* The default SGML declaration. */#define MAXNUMBER 99999999L/* Reference quantity set */#define REFATTCNT 40#define REFATTSPLEN 960#define REFBSEQLEN 960#define REFDTAGLEN 16#define REFDTEMPLEN 16#define REFENTLVL 16#define REFGRPCNT 32#define REFGRPGTCNT 96#define REFGRPLVL 16#define REFNORMSEP 2#define REFPILEN 240#define REFTAGLEN 960#define REFTAGLVL 24#define ALLOC_MAX 65534#define BIGINT 30000#define MAXATTCNT ((ALLOC_MAX/sizeof(struct ad)) - 2)#define MAXATTSPLEN BIGINT#define MAXBSEQLEN BIGINT#define MAXDTAGLEN 16#define MAXDTEMPLEN 16#define MAXENTLVL ((ALLOC_MAX/sizeof(struct source)) - 1)#define MAXGRPCNT MAXGRPGTCNT/* Must be between 96 and 253 */#define MAXGRPGTCNT 253#define MAXGRPLVL MAXGRPGTCNT#define MAXLITLEN BIGINT/* This guarantees that NAMELEN < LITLEN (ie there's always space for a namein a buffer intended for a literal.) */#define MAXNAMELEN (REFLITLEN - 1)#define MAXNORMSEP 2#define MAXPILEN BIGINT#define MAXTAGLEN BIGINT#define MAXTAGLVL ((ALLOC_MAX/sizeof(struct tag)) - 1)/* Table of quantity names.  Must match Q* in sgmldecl.h. 
*/static char *quantity_names[] = {    "ATTCNT",       "ATTSPLEN",     "BSEQLEN",      "DTAGLEN",      "DTEMPLEN",     "ENTLVL",       "GRPCNT",       "GRPGTCNT",     "GRPLVL",       "LITLEN",       "NAMELEN",      "NORMSEP",      "PILEN",        "TAGLEN",       "TAGLVL",    };static int max_quantity[] = {    MAXATTCNT,    MAXATTSPLEN,    MAXBSEQLEN,    MAXDTAGLEN,    MAXDTEMPLEN,    MAXENTLVL,    MAXGRPCNT,    MAXGRPGTCNT,    MAXGRPLVL,    MAXLITLEN,    MAXNAMELEN,    MAXNORMSEP,    MAXPILEN,    MAXTAGLEN,    MAXTAGLVL,};static char *quantity_changed;/* Non-zero means the APPINFO parameter was not NONE. */static int appinfosw = 0;struct sgmldecl sd = {     REFCAPSET,			/* capacity */#ifdef SUPPORT_SUBDOC     MAXNUMBER,			/* subdoc */#else /* not SUPPORT_SUBDOC */     0,				/* subdoc */#endif /* not SUPPORT_SUBDOC */     1,				/* formal */     1,				/* omittag */     1,				/* shorttag */     1,				/* shortref */     { 1, 0 },			/* general/entity name case translation */     {				/* reference quantity set */	  REFATTCNT,	  REFATTSPLEN,	  REFBSEQLEN,	  REFDTAGLEN,	  REFDTEMPLEN,	  REFENTLVL,	  REFGRPCNT,	  REFGRPGTCNT,	  REFGRPLVL,	  REFLITLEN,	  REFNAMELEN,	  REFNORMSEP,	  REFPILEN,	  REFTAGLEN,	  REFTAGLVL,     },};static int systemcharset[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 
171, 172, 173, 174, 175,176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,};/* This is a private use designating sequence that by conventionrefers to the whole system character set whatever it is. */#define SYSTEM_CHARSET_DESIGNATING_SEQUENCE "ESC 2/5 2/15 3/0"static struct pmap charset_map[] = {     { "ESC 2/5 4/0", (UNIV)iso646charset }, /* ISO 646 IRV */     { "ESC 2/8 4/2", (UNIV)iso646G0charset }, /* ISO Registration Number 6, ASCII */     { "ESC 2/8 4/0", (UNIV)iso646G0charset }, /* ISO Registration Number 6, ASCII */     { "ESC 2/13 4/1", (UNIV)iso8859_1charset }, /* Latin 1 */     { "ESC 2/1 4/0", (UNIV)iso646C0charset }, /* ISO 646, C0 */     { "ESC 2/2 4/3", (UNIV)iso6429C1charset }, /* ISO 6429, C1 */     { SYSTEM_CHARSET_DESIGNATING_SEQUENCE, (UNIV)systemcharset },				/* system character set */     { 0 }};static int synrefcharset[256];	/* the syntax reference character set */#define CHAR_NONSGML 01#define CHAR_SIGNIFICANT 02#define CHAR_MAGIC 04#define CHAR_SHUNNED 010static UNCH char_flags[256];static int done_nonsgml = 0;static UNCH *nlextoke = 0;	/* new lextoke */static UNCH *nlextran = 0;	/* new lextran */#define MAX_SAVED_ERRS 4static UNIV saved_errs[MAX_SAVED_ERRS];static int nsaved_errs = 0;static UNCH kcharset[] = "CHARSET";static UNCH kbaseset[] = "BASESET";static UNCH kdescset[] = "DESCSET";static UNCH kunused[] = "UNUSED";static UNCH kcapacity[] = "CAPACITY";static UNCH kpublic[] = "PUBLIC";static UNCH ksgmlref[] = "SGMLREF";static UNCH kscope[] = "SCOPE";static UNCH kdocument[] = "DOCUMENT";static UNCH kinstance[] = "INSTANCE";static UNCH ksyntax[] = "SYNTAX";static UNCH kswitches[] = "SWITCHES";static 
UNCH kfeatures[] = "FEATURES";
static UNCH kminimize[] = "MINIMIZE";
static UNCH kdatatag[] = "DATATAG";
static UNCH komittag[] = "OMITTAG";
static UNCH krank[] = "RANK";
static UNCH kshorttag[] = "SHORTTAG";
static UNCH klink[] = "LINK";
static UNCH ksimple[] = "SIMPLE";
static UNCH kimplicit[] = "IMPLICIT";
static UNCH kexplicit[] = "EXPLICIT";
static UNCH kother[] = "OTHER";
static UNCH kconcur[] = "CONCUR";
static UNCH ksubdoc[] = "SUBDOC";
static UNCH kformal[] = "FORMAL";
static UNCH kyes[] = "YES";
static UNCH kno[] = "NO";
static UNCH kappinfo[] = "APPINFO";
static UNCH knone[] = "NONE";
static UNCH kshunchar[] = "SHUNCHAR";
static UNCH kcontrols[] = "CONTROLS";
static UNCH kfunction[] = "FUNCTION";
static UNCH krs[] = "RS";
static UNCH kre[] = "RE";
static UNCH kspace[] = "SPACE";
static UNCH knaming[] = "NAMING";
static UNCH klcnmstrt[] = "LCNMSTRT";
static UNCH kucnmstrt[] = "UCNMSTRT";
static UNCH klcnmchar[] = "LCNMCHAR";
static UNCH kucnmchar[] = "UCNMCHAR";
static UNCH knamecase[] = "NAMECASE";
static UNCH kdelim[] = "DELIM";
static UNCH kgeneral[] = "GENERAL";
static UNCH kentity[] = "ENTITY";
static UNCH kshortref[] = "SHORTREF";
static UNCH knames[] = "NAMES";
static UNCH kquantity[] = "QUANTITY";

/* Errors in this file are reported through mderr under the name sderr. */
#define sderr mderr

/* Prototypes for the file-local functions.  P(()) wraps the parameter
list (presumably so that pre-ANSI compilers can drop it -- see the
definition of P in the project headers). */
static UNIV pmaplookup P((struct pmap *, char *));
static UNCH *ltous P((long));
static VOID sdfixstandard P((UNCH *, int));
static int sdparm P((UNCH *, struct parse *));
static int sdname P((UNCH *, UNCH *));
static int sdckname P((UNCH *, UNCH *));
static int sdversion P((UNCH *));
static int sdcharset P((UNCH *));
static int sdcsdesc P((UNCH *, int *));
static int sdpubcapacity P((UNCH *));
static int sdcapacity P((UNCH *));
static int sdscope P((UNCH *));
static VOID setlexical P((void));
static VOID noemptytag P((void));
static int sdpubsyntax P((UNCH *));
static int sdsyntax P((UNCH *));
static int sdxsyntax P((UNCH *));
static int sdtranscharnum P((UNCH *));
static int sdtranschar P((int));
static int sdshunchar P((UNCH *));
static int sdsynref P((UNCH *));
static int sdfunction /* parameter list continues on the next source line */
P((UNCH *));static int sdnaming P((UNCH *));static int sddelim P((UNCH *));static int sdnames P((UNCH *));static int sdquantity P((UNCH *));static int sdfeatures P((UNCH *));static int sdappinfo P((UNCH *));static VOID sdsaverr P((UNS, UNCH *, UNCH *));static VOID bufsalloc P((void));static VOID bufsrealloc P((void));/* Parse the SGML declaration. Return non-zero if there was some appinfo. */int sgmldecl(){     int i;     int errsw = 0;     UNCH endbuf[REFNAMELEN+2];	/* buffer for parsing terminating > */     static int (*section[]) P((UNCH *)) = {	  sdversion,	  sdcharset,	  sdcapacity,	  sdscope,	  sdsyntax,	  sdfeatures,	  sdappinfo,     };     /* These are needed if we use mderr. */     parmno = 0;     mdname = sgmlkey;     subdcl = NULL;     nsaved_errs = 0;     for (i = 0; i < SIZEOF(section); i++)	  if ((*section[i])(tbuf) == FAIL) {	       errsw = 1;	       break;	  }     if (sd.formal) {	  /* print saved errors */	  int i;	  for (i = 0; i < nsaved_errs; i++)	       svderr(saved_errs[i]);     }     else {	  /* free saved errors */	  int i;	  for (i = 0; i < nsaved_errs; i++)	       msgsfree(saved_errs[i]);     }     if (!errsw)	  setlexical();     bufsrealloc();     /* Parse the >.  Don't overwrite the appinfo. */     if (!errsw)	  sdparm(endbuf, 0);     /* We must exit if we hit end of document. */     if (pcbsd.action == EOD_)	  exiterr(161, &pcbsd);     if (!errsw && pcbsd.action != ESGD)	  sderr(126, (UNCH *)0, (UNCH *)0);     return appinfosw;}/* Parse the literal (which should contain the version of thestandard) at the beginning of a SGML declaration. */static int sdversion(tbuf)UNCH *tbuf;{     if (sdparm(tbuf, &pcblitv) != LIT1) {	  sderr(123, (UNCH *)0, (UNCH *)0);	  return FAIL;     }     sdfixstandard(tbuf, 0);     if (ustrcmp(tbuf, standard) != 0)	  sderr(E_BADVERSION, tbuf, standard);     return SUCCESS;}/* Parse the CHARSET section. Use one token lookahead. 
*/static int sdcharset(tbuf)UNCH *tbuf;{     int i;     int status[256];     if (sdname(tbuf, kcharset) == FAIL) return FAIL;     (void)sdparm(tbuf, 0);     if (sdcsdesc(tbuf, status) == FAIL)	  return FAIL;#if 0     for (i = 128; i < 256; i++)	  if (status[i] != UNDESC)	       break;     if (i >= 256) {	  /* Only a 7-bit character set was described.  Fill it out to 8-bits. */	  for (i = 128; i < 256; i++)	       status[i] = UNUSED;#if 0	  sderr(E_7BIT, (UNCH *)0, (UNCH *)0);#endif     }#endif     /* Characters that are declared UNUSED in the document character set	are assigned to non-SGML. */     for (i = 0; i < 256; i++) {	  if (status[i] == UNDESC) {#if 0	       sderr(E_CHARMISSING, ltous((long)i), (UNCH *)0);#endif	       char_flags[i] |= CHAR_NONSGML;	  }	  else if (status[i] == UNUSED)	       char_flags[i] |= CHAR_NONSGML;     }     done_nonsgml = 1;     return SUCCESS;}/* Parse a character set description.   Uses one character lookahead. */static int sdcsdesc(tbuf, status)UNCH *tbuf;int *status;{     int i;     int nsets = 0;     struct fpi fpi;     for (i = 0; i < 256; i++)	  status[i] = UNDESC;     for (;;) {	  int nchars;	  int *baseset = 0;	  if (pcbsd.action != NAS1) {	       if (nsets == 0) {		    sderr(120, (UNCH *)0, (UNCH *)0);		    return FAIL;	       }	       break;	  }	  if (!matches(tbuf, kbaseset)) {	       if (nsets == 0) {		    sderr(118, tbuf+1, kbaseset);		    return FAIL;	       }	       break;	  }	  nsets++;	  MEMZERO((UNIV)&fpi, FPISZ);	  if (sdparm(tbuf, &pcblitv) != LIT1) {	       sderr(123, (UNCH *)0, (UNCH *)0);	       return FAIL;	  }	  fpi.fpipubis = tbuf;	  /* Give a warning if it is not a CHARSET fpi. 
*/	  if (parsefpi(&fpi))	       sdsaverr(E_FORMAL, (UNCH *)0, (UNCH *)0);	  else if (fpi.fpic != FPICHARS)	       sdsaverr(E_BADCLASS, kcharset, (UNCH *)0);	  else {	       fpi.fpipubis[fpi.fpil + fpi.fpill] = '\0';	       baseset = (int *)pmaplookup(charset_map,					   (char *)fpi.fpipubis + fpi.fpil);	       if (!baseset)		    sderr(E_UNKNOWNSET, fpi.fpipubis + fpi.fpil, (UNCH *)0);	  }	  if (sdname(tbuf, kdescset) == FAIL) return FAIL;	  nchars = 0;	  for (;;) {	       long start, count;	       long basenum;	       if (sdparm(tbuf, 0) != NUM1)		    break;	       start = atol((char *)tbuf);	       if (sdparm(tbuf, 0) != NUM1) {		    sderr(E_XNUM, (UNCH *)0, (UNCH *)0);		    return FAIL;	       }	       count = atol((char *)tbuf);	       switch (sdparm(tbuf, &pcblitv)) {	       case NUM1:		    basenum = atol((char *)tbuf);		    break;	       case LIT1:		    basenum = UNKNOWN;		    break;	       case NAS1:		    if (matches(tbuf, kunused)) {			 basenum = UNUSED;			 break;		    }		    /* fall through */	       default:		    sderr(E_CHARDESC, ltous(start), (UNCH *)0);		    return FAIL;	       }	       if (start + count > 256)		    sderr(E_CHARRANGE, (UNCH *)0, (UNCH *)0);	       else {		    int i;		    int lim = (int)start + count;		    for (i = (int)start; i < lim; i++) {			 if (status[i] != UNDESC)			      sderr(E_CHARDUP, ltous((long)i), (UNCH *)0);			 else if (basenum == UNUSED || basenum == UNKNOWN)			      status[i] = (int)basenum;			 else if (baseset == 0)			      status[i] = UNKNOWN_SET;			 else {			      int n = basenum + (i - start);			      if (n < 0 || n > 255)				   sderr(E_CHARRANGE, (UNCH *)0, (UNCH *)0);			      else {				   if (baseset[n] == UNUSED)					sderr(E_BADBASECHAR, ltous((long)n),					      (UNCH *)0);				   status[i] = baseset[n];			      }			 }		    }	       }	       nchars++;	  }	  if (nchars == 0) {	       sderr(E_XNUM, (UNCH *)0, (UNCH *)0);	       return FAIL;	  }     }     return SUCCESS;}/* Parse the CAPACITY section.  
Uses one token lookahead. */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -