⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pars2.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
#include "sgmlincl.h"         /* #INCLUDE statements for SGML parser. */
/* PARSE: Parse a source input stream with specified lexical and state tables.
          Return to caller with action code.

   Each pass of the loop:
     1. runs NEWCC, then classifies the character at *FPOS through the
        lexical table pcb->plex;
     2. makes the previously computed pcb->newstate the current state;
     3. reads the next state from the table at pcb->ptab[state] and the
        action from the table at pcb->ptab[state + 1], both indexed by the
        input class.
   Actions that are fully handled here `continue' the loop; every action
   that leaves the switch with `break' (including the default case) is
   returned to the caller as (int)pcb->action.

   The function calls itself with &pcbref to consume the terminator of an
   entity or character reference; the value returned by those nested calls
   is ignored.
*/
int parse(pcb)
struct parse *pcb;            /* Current parse control block. */
{
     int rc;                  /* Return code from ENTREF. */

     while (1) {
          NEWCC;
          pcb->input = pcb->plex[*FPOS];
          pcb->state = pcb->newstate;
          /* Next-state row and action row for a state are adjacent
             entries of ptab. */
          pcb->newstate = (*(pcb->ptab + pcb->state)) [pcb->input];
          pcb->action = (*(pcb->ptab + pcb->state + 1)) [pcb->input];
          TRACEPCB(pcb);
          switch (pcb->action) {
          case RC2_:          /* Back up two characters. */
               REPEATCC;
               /* FALLTHROUGH: the second REPEATCC is done by RCC_. */
          case RCC_:          /* Repeat current character. */
               REPEATCC;
               /* FALLTHROUGH */
          case NOP_:          /* No action necessary.*/
               continue;
          case RS_:           /* Record start: ccnt=0; ++rcnt.*/
               ++RCNT; CTRSET(RSCC);
               continue;
          case GET_:          /* EOB or dull EOS or EE found: keep going.*/
               if (entget()==-1) {pcb->action = EOD_; break;}/* Signal if EOD.*/
               continue;
          case EOF_:          /* Illegal entity end; return EE_. */
               synerr(E_EOF, pcb);
               pcb->action = EE_;
               /* FALLTHROUGH: after reporting, handle exactly like EE_. */
          case EE_:           /* Important EOS or EE found: return to caller.*/
               if (entget()==-1) pcb->action = EOD_;   /* Signal if EOD. */
               break;
          case PER_:          /* Parameter entity reference. */
               REPEATCC;           /* Use PERO as 1st char of entity name. */
               parsenm(entbuf, ENTCASE);
               parse(&pcbref);     /* Handle REFC or other terminator. */
               rc = entref(entbuf);
               /* Only a PI entity is reported to the caller; any other
                  result keeps parsing. */
               if (rc==ENTPI) {pcb->action = PIE_; break;}
               continue;
          case ER_:           /* General entity reference; continue. */
               parsenm(entbuf, ENTCASE);
               parse(&pcbref);     /* Handle REFC or other terminator. */
               rc = entref(entbuf);
               if (rc==ENTDATA) {pcb->action = DEF_; break;}
               if (rc==ENTPI) {pcb->action = PIE_; break;}
               continue;
          case PEX_:          /* Parameter entity reference; return. */
               REPEATCC;           /* Use PERO as 1st char of entity name. */
               /* FALLTHROUGH: the rest is shared with ERX_. */
          case ERX_:          /* General entity reference; return. */
               parsenm(entbuf, ENTCASE);
               parse(&pcbref);     /* Handle REFC or other terminator. */
               rc = entref(entbuf);
               if (rc == ENTDATA){
                    /* Reference to external data/subdoc entity in replaceable
                       character data. */
                    if (BITON(entdatsw, NDECONT)) {
                         switch (((PNE)data)->nextype) {
                         case ESNCDATA:
                         case ESNSDATA:
                              /* The standard says `non-SGML data entity'
                                 but the amendment should have changed it
                                 to `external data entity'. */
                              synerr(145, pcb);
                              break;
                         case ESNNDATA:
                         case ESNSUB:
                              /* This is definitely illegal. */
                              synerr(141, pcb);
                              break;
                         }
                         /* Clear the switch and resume the outer parse loop
                            instead of returning the entity. */
                         entdatsw = 0;
                         continue;
                    }
                    pcb->action = DEF_;
               }
               else if (rc == ENTPI) {
                    /* Reference to PI entity not allowed in replaceable
                       character data. */
                    synerr(59, pcb);
                    entpisw = 0;
                    continue;
               }
               /* Any other non-zero result from entref is returned to the
                  caller as EE_; a zero result returns the action unchanged. */
               else if (rc) pcb->action = EE_;
               break;
          case CRN_:          /* Character reference: numeric. */
               parsetkn(entbuf, NU, NAMELEN);
               parse(&pcbref);     /* Handle reference terminator. */
               pcb->action = charrefn(entbuf, pcb);
               /* charrefn hands back the unchanged action (CRN_) when the
                  number is rejected. */
               if (pcb->action==CRN_) continue;   /* Invalid reference */
               break;
          case CRA_:           /* Character reference: alphabetic. */
               parsenm(entbuf, NAMECASE);
               parse(&pcbref);     /* Handle reference terminator. */
               charrefa(entbuf);
               if (docelsw) synerr(232, pcb);
               continue;
          case SYS_:          /* Invalid NONCHAR: send msg and ignore. */
               synerr(E_SYS, pcb);
               /* If the offending character is the DELNONCH prefix, run
                  NEWCC once more before resuming. */
               if (*FPOS == DELNONCH) NEWCC;
               continue;
          case NON_:          /* Valid NONCHAR: prefix and shift encoding. */
               synerr(60, pcb);
               pcb->action = datachar(*FPOS, pcb);
               break;
          case NSC_:
               /* Reports error 60, advances with NEWCC, stores that
                  character unmodified in nonchbuf[1], and returns NON_. */
               synerr(60, pcb);
               NEWCC;
               nonchbuf[1] = *FPOS;
               pcb->action = NON_;
               break;
          case PCI_:          /* Previous character was invalid (INV_). */
               REPEATCC;
               /* FALLTHROUGH: one extra REPEATCC, then same as INV_. */
          case INV_:          /* Markup ended by invalid char; repeat char. */
               synerr(9, pcb);
               REPEATCC;
               break;
          case LNR_:          /* Previous char exceeded len; back up to it. */
               REPEATCC;
               /* FALLTHROUGH */
          case LEN_:          /* Token too long; ignore excess character. */
               synerr(3, pcb);
               continue;
          case RCR_:          /* Repeat current char and return to caller. */
               REPEATCC;
               /* FALLTHROUGH */
          default:            /* Actions for specific parse. */
               break;
          }
          /* Reached only via `break' above: hand the action to the caller. */
          return (int)pcb->action;
     }
}
/* CHARREFA: Resolve an alphabetical reference to a function character
             and put the character in the read buffer.
             If reference is bad, issue an error message.

   The name is looked up in funtab starting at r+1, i.e. past the first
   byte of r (the length byte, per the parameter description below).
   A lookup result of 0 is reported as error 62 against pcbref; otherwise
   the character replaces the current one via setcurchar() and REPEATCC
   is run.
*/
VOID charrefa(r)
UNCH *r;                      /* Undelimited char ref (with length and EOS). */
{
     UNCH thechar;

     thechar = mapsrch(funtab, r+1);
     if (thechar == 0)
          synerr(62, &pcbref);
     else {
          /* This isn't ideal, because the character position will still
             be wrong for one line. */
          if (thechar == RSCHAR) RCNT--;
          setcurchar(thechar);
          REPEATCC;
     }
}
/* Make the current character ch.
*/VOID setcurchar(ch)int ch;{     /* If we're reading directly from an internal entity, we can't	change the entity, since the entity might be referenced again.	So in this case we copy the entity.  This is inefficient, but	it will only happen in a case like this:		<!entity % amp "&">	<!entity e "x%amp;#SPACE;">		Usually character references will have been processed while the	entity was being defined.  */     if (*FPOS != ch) {	  if (!FILESW && !COPIEDSW) {	       UNCH *s = savestr(FBUF + 1);	       FPOS = s + (FPOS - FBUF - 1);	       FBUF = s - 1;	       COPIEDSW = 1;	  }	  *FPOS = ch;     }}/* CHARREFN: Resolve a numeric character reference.             If reference is bad, issue an error message.*/int charrefn(r, pcb)UNCH *r;                      /* Undelimited character reference. */struct parse *pcb;            /* Current parse control block. */{     int thechar;     thechar = atoi((char *)r);     if (thechar<0 || thechar>255) {          synerr(61, &pcbref);          return((int)pcb->action);     }     return datachar(thechar, pcb);}/* Return ch as a datachar.  If this a non-SGML character which mightconfuse the parser, shift it to a code that won't and place it in aspecial buffer which has DELNONCH in the preceding byte.  Otherwiseput it the read buffer. */int datachar(ch, pcb)int ch;struct parse *pcb;{     switch (ch) {     case EOS:     case EOFCHAR:     case EOBCHAR:     case GENRECHAR:     case DELCDATA:     case DELSDATA:     case DELNONCH:	  /* A potentially confusing character which must be prefixed	     with DELNONCH. */          nonchbuf[1] = SHIFTNON((UNCH)ch);          return NON_;     }     setcurchar(ch);     /* If in content, return DCE_ for element content, DAF_ for mixed.  */     /* If not content, it must be a literal parse, so return MLA_. */     if (pcb == conpcb) {	  if (pcb == &pcbcone)	       return DCE_;	  else {	       data = FPOS;	       /* Action for DAF_ will do REPEATCC. 
*/	       NEWCC;	       return DAF_;	  }     }     else	  return MLA_;}/* INITATT: Initialize al with adl. */VOID initatt(adl)struct ad *adl;{     notadn = 0;              /* No NOTATION attribute yet. */     conrefsw = 0;            /* Assume no content reference att. */     /* Copy attribute definition list as a template. */     memcpy((UNIV)al, (UNIV)adl, (1+ADN(adl))*ADSZ);}/* PARSEATT: Parse attribute specification list.             Make a current copy of the attribute definition list             and update it with the user's specifications.             Indicate each attribute that was specified in the             list (as opposed to defaulted) by setting the ASPEC flag.             If no attributes were specified, return NULL.  Otherwise,             if in the prolog, make a permanent copy of the list and             return its pointer.  If not in the prolog, return al.*/struct ad *parseatt(adl, pt)struct ad *adl;               /* Attribute definition list. */UNCH *pt;                     /* Tokenization area: tbuf[TAGLEN+ATTSPLEN]. */{     UNCH *antvptr;     UNCH *nm = 0;            /* Pointer to saved name in tbuf (with length). */     int adn = -1;            /* Position of attribute in list (-1=empty). */     UNCH *tbuflim = pt + ATTSPLEN;     mdessv = es;             /* Save es for checking entity nesting. */     initatt(adl);     while (pt<=tbuflim) {          parse(&pcbstag);          switch (pcbstag.action) {          case NVS:                     /* Att name or value token found. */               parsenm(pt, NAMECASE);   /* Case translation wanted on name. */               pt += *(nm = pt);        /* Save name while pointing past it. */               continue;          case AVD:           /* Delimited value found. */          case AVDA:          /* Delimited value found (alternate delimiter). */               /* Find position (adn) of saved attribute name in list. 
*/               adn = anmget((int)ADN(al), nm);               parselit(pt,			(adn == 0 || ADTYPE(al, adn) == ACHARS)			? &pcblitr			: &pcblitt,			LITLEN,			(pcbstag.action==AVD) ? lex.d.lit : lex.d.lita);	       if (adn == 0) {                    /* Error: unrecognized attribute name. */                    sgmlerr(13, &pcbstag, nm+1, pt);                    continue;               }               /* Tokenize and validate value; let it default if an error. */               /* Put value in list and bump ptr by the normalized length                  (which is always >= the actual length). */               if (!attval(1, pt, adn, adl)) pt += ADLEN(al,adn);	       continue;          case AVU:           /* Attribute value found: undelimited. */	       if (!sd.shorttag) sgmlerr(196, &pcbstag, (UNCH *)0, (UNCH *)0);	       parsetkn(pt, NMC, LITLEN);               /* Find position (adn) of saved attribute name in list. */               if ((adn = anmget((int)ADN(al), nm))==0) {                    /* Error: unrecognized attribute name. */                    sgmlerr(13, &pcbstag, nm+1, pt);                    continue;               }               /* Tokenize and validate value; let it default if an error. */               /* Put value in list and bump ptr by the normalized length                  (which is always >= the actual length). */               if (!attval(1, pt, adn, adl)) pt += ADLEN(al,adn);               continue;          case NASV:          /* Saved NVS was really an NTV. */               REPEATCC;           /* Put back next token starter. */               pt = nm;            /* Back up to NVS. */          case NTV:           /* Name token value found. */	       if (!sd.shorttag) sgmlerr(195, &pcbstag, (UNCH *)0, (UNCH *)0);               if (pcbstag.action==NTV) parsenm(pt, NAMECASE);               if ((adn = antvget((int)ADN(al), pt, &antvptr))==0) {                    /* Error: unrecognized name token value. 
*/                    sgmlerr(74, &pcbstag, pt+1, (UNCH *)0);                    continue;               }               /* Validate value; let it default if an error. */               /* Put value in list and bump ptr by the normalized length

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -