⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pars2.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
	  }     default:                      /* RCR_: Repeat char and return. */          break;     }     if (sw.swambig) ambig();	   /* Check content model for ambiguity. */     return gbuf;}/* PARSEGCM: Collect token headers (struct thdr) into a group (array).             An etd is defined for each GI (if none exists) and its pointer is             stored in the header.  The function is called recursively.*/struct thdr *parsegcm(pcb, pgh, gbuf)struct parse *pcb;                 /* Current parse control block. */struct thdr *pgh;                  /* Current group header in group buffer. */struct thdr *gbuf;                 /* Header for outermost group (model). */{#define MCON gbuf->ttype           /* Model type (content attributes). */     struct thdr *pg=pgh;          /* Current group token. */     struct thdr *pgsv=pgh;        /* Saved current token for occ indicator. */     int optcnt = 0;               /* Count of optional tokens in group. */     int essv = es;                /* Entity stack level when grp started. */    while (gbuf->tu.tnum<=GRPGTCNT && pgh->tu.tnum<=GRPCNT && parse(pcb)!=GRPE)     switch (pcb->action) {     case NAS_:          /* GI name: get its etd and store it. */          ++gbuf->tu.tnum; ++pgh->tu.tnum;          (pgsv = ++pg)->ttype = TTETD;          pg->tu.thetd = etddef(parsenm(tbuf, NAMECASE));          SET(MCON, MGI);          continue;     case RNS_:          /* Reserved name started (#PCDATA). */          parsenm(tbuf, NAMECASE);          if (ustrcmp(tbuf+1, key[KPCDATA])) {               mderr(116, ntoa(gbuf->tu.tnum), tbuf+1);               return (struct thdr *)0;          }          /* If #PCDATA is the first non-group token, model is a phrase. */          if (!MCON) SET(MCON, MPHRASE);     case DTAG:          /* Data tag template ignored; treat as #PCDATA. */          if (pcb->action==DTAG) SET(pgh->ttype, TTSEQ); /* DTAG is SEQ grp. 
*/          ++gbuf->tu.tnum; ++pgh->tu.tnum;          (++pg)->ttype = TTCHARS+TOREP;/* #PCDATA is OPT and REP. */          pg->tu.thetd = ETDCDATA;          ++optcnt;                     /* Ct opt tokens to see if grp is opt.*/          SET(MCON, MCHARS);          continue;     case GRP_:          /* Group started. */          ++gbuf->tu.tnum; ++pgh->tu.tnum;          (pgsv = ++pg)->ttype = 0;     /* Type will be set by connector. */          pg->tu.tnum = 0;              /* Group has number instead of etd. */          if (++grplvl>GRPLVL) {               mderr(115, ntoa(gbuf->tu.tnum), (UNCH *)0);               return (struct thdr *)0;          }          pg = parsegcm(pcb, pg, gbuf);          if (!pg) return (struct thdr *)0;          if (GET(pgsv->ttype, TOPT)) ++optcnt;  /* Indicate nested opt grp. */          --grplvl;          continue;     case OREP:          /* OREP occurrence indicator for current token.*/          SET(pgsv->ttype, TREP|TXREP);                         /* Now treat like OPT. */     case OPT:           /* OPT occurrence indicator for current token. */          SET(pgsv->ttype, TXOPT);          if (GET(pgsv->ttype, TOPT)) continue;  /* Exit if nested opt grp. */          SET(pgsv->ttype, TOPT);          ++optcnt;      /* Count opt tokens to see if grp is optional. */          continue;     case REP:           /* REP occurrence indicator for current token. */          SET(pgsv->ttype, TREP|TXREP);          continue;     case OR:            /* OR connector found. */          if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTOR);          else if (GET(pgh->ttype, TTAND)!=TTOR)               mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);          continue;     case AND:           /* AND connector found. */          if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTAND);          else if (GET(pgh->ttype, TTAND)!=TTAND)               mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);          continue;     case SEQ:           /* SEQ connector found. 
*/          if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTSEQ);          else if (GET(pgh->ttype, TTAND)!=TTSEQ)               mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);          continue;     case EE_:           /* Entity ended (correctly or incorrectly). */          if (es<essv) {synerr(37, pcb); essv = es;}          continue;     case PIE_:          /* PI entity reference (not permitted). */          entpisw = 0;   /* Reset PI entity indicator. */          synerr(59, pcb);          continue;     default:            /* Syntax errors return in disgrace. */          synerr(37, pcb);          return (struct thdr *)0;     }     if (pgh->tu.tnum>GRPCNT) {          mderr(113, ntoa(gbuf->tu.tnum), (UNCH *)0);          return (struct thdr *)0;     }     if (gbuf->tu.tnum>GRPGTCNT) {          mderr(114, ntoa(gbuf->tu.tnum), (UNCH *)0);          return (struct thdr *)0;     }     if (pgh->tu.tnum==1) SET(pgh->ttype, TTSEQ); /* Unit grp is SEQ. */     /* An optional token in an OR group makes the group optional. */     if (GET(pgh->ttype, TTMASK)==TTOR && optcnt) SET(pgh->ttype, TOPT);     /* If all tokens in any group are optional, so is the group. */     if (pgh->tu.tnum<=optcnt) SET(pgh->ttype, TOPT);     if (es!=essv) synerr(37, pcb);     return pg;                             /* Return pointer to GRPS token. */}/* PARSENM: Parser for SGML names, which can be translated with LEXTRAN.            The input is read from the entity stack.  CC is 1st char of name.            Returns a pointer to the parsed name.*/UNCH *parsenm(tbuf, nc)UNCH *tbuf;                   /* Buffer for name: >=NAMELEN+2. */int nc;                       /* Namecase translation: 1=yes; 0=no. */{     UNCH   len;              /* Length of name (incl EOS & length byte). */     *(tbuf + (len = 1) ) = nc ? lextran[*FPOS] : *FPOS;     while ((NEWCC, (int)lextoke[*FPOS]>=NMC) && (len<NAMELEN)) {          TRACETKN(NMC, lextoke);          if (lextoke[*(tbuf + ++len) = (nc ? 
lextran[*FPOS] : *FPOS)]==EOB) {               --len;               entget();          }     }     REPEATCC;                       /* Put back the non-token character. */     *(tbuf + ++len) = EOS;          /* Terminate name with standard EOS. */     *tbuf = ++len;                  /* Store length ahead of name. */     return tbuf;}/* PARSETKN: Parser for start-tag attribute value tokens.             First character of token is already in *FPOS.             Returns a pointer to the parsed token.	     Parsed token has EOS but no length byte.*/#ifdef USE_PROTOTYPESUNCH *parsetkn(UNCH *tbuf, UNCH scope, int maxlen)#elseUNCH *parsetkn(tbuf, scope, maxlen)UNCH *tbuf;		      /* Buffer for token: >=maxlen+1. */UNCH scope;		      /* Minimum lexical class allowed. */int maxlen;		      /* Maximum length of a token. */#endif{     int i = 1;     tbuf[0] = *FPOS;     while (i < maxlen) {	  NEWCC;	  if (lextoke[*FPOS] < scope) {	       REPEATCC;	       break;	  }          TRACETKN(scope, lextoke);	  if (*FPOS == EOBCHAR)	       entget();	  else	       tbuf[i++] = *FPOS;     }     tbuf[i] = EOS;     return tbuf;}/* PARSESEQ: Parser for blank sequences (i.e., space and TAB characters ).             First character of sequence is already in *FPOS.*/VOID parseseq(tbuf, maxlen)UNCH *tbuf;		      /* Buffer for storing found sequence. */int maxlen;		      /* Maximum length of a blank sequence. */{     tbuf[0] = *FPOS;     datalen = 1;     for (;;) {	  NEWCC;	  if (*FPOS == EOBCHAR) {	       entget();	       continue;	  }	  if ((lextoke[*FPOS] != SEP && *FPOS != SPCCHAR)	      || datalen >= maxlen)	       break;	  tbuf[datalen++] = *FPOS;	  TRACETKN(SEP, lextoke);     }}/* S2VALNM: Parser for attribute values that are tokenized like names.            The input is read from a string (hence S ("string") 2 ("to") VALNM).            It stops at the first bad character.            
Returns a pointer to the created name.*/#ifdef USE_PROTOTYPESUNCH *s2valnm(UNCH *nm, UNCH *s, UNCH scope, int translate)#elseUNCH *s2valnm(nm, s, scope, translate)UNCH *nm;                     /* Name to be created. */UNCH *s;                      /* Source string to be parsed as name. */UNCH scope;                   /* Minimum lexical class allowed. */int translate;                /* Namecase translation: 1=yes; 0=no. */#endif{     UNCH len = 0;            /* Length of name (incl EOS and length). */     for (; (int)lextoke[*s] >= scope && len < NAMELEN; s++)	  nm[++len] = translate ? lextran[*s] : *s;     nm[++len] = EOS;         /* Terminate name with standard EOS. */     *nm = ++len;             /* Store length ahead of name. */     return nm;}/* PARSEVAL: Parser for attribute values.             The input is read from a string and tokenized in a buffer.             The input is terminated by EOS.             Each token is preceded by its actual length; there is no EOS.             If an error occurs while parsing, or             if a token doesn't conform, set the token count to 0 to show that             value was not tokenized and return the error code.             After successful parse, return buffer length and 0 error code.             The number of tokens found is set in external variable tokencnt.*/int parseval(s, atype, tbuf)UNCH *s;                      /* Source string to be parsed as token list. */UNS atype;                    /* Type of token list expected. */UNCH *tbuf;                   /* Work area for tokenization. 
*/{     int t;     UNCH *pt = tbuf;     pcbval.newstate = 0; tokencnt = 0;     while (1) {          for (;;) {               pcbval.input = lextoke[*s];               pcbval.state = pcbval.newstate;               pcbval.newstate = (*(pcbval.ptab + pcbval.state)) [pcbval.input];               pcbval.action = (*(pcbval.ptab + pcbval.state+1)) [pcbval.input];               TRACEVAL(&pcbval, atype, s, tokencnt);	       if (pcbval.action != NOPA)		    break;	       s++;          }          switch (pcbval.action) {          case INVA:          /* Invalid character; terminate parse. */               if (*s == '\0') goto alldone;  /* Normal termination. */               tokencnt = 0;  /* Value was not tokenized. */               return(14);          case LENA:          /* Length limit of token exceeded; end parse. */               tokencnt = 0;  /* Value was not tokenized. */               return(15);          default:            /* Token begun: NUMA, NASA, or NMTA. */               break;          }          ++tokencnt;         /* One token per iteration. */          switch (atype) {          case AENTITY:               if (tokencnt>1) {tokencnt = 0; return(16);}          case AENTITYS:               if (pcbval.action!=NASA) {tokencnt = 0; return(17);}               s2valnm(pt, s, NMC, ENTCASE);               break;          case AID:          case AIDREF:          case ANAME:          case ANOTEGRP:               if (tokencnt>1) {tokencnt = 0; return(16);}          case AIDREFS:          case ANAMES:               if (pcbval.action!=NASA) {tokencnt = 0; return(17);}               s2valnm(pt, s, NMC, NAMECASE);               break;          case ANMTGRP:          case ANMTOKE:               if (tokencnt>1) {tokencnt = 0; return(16);}          case ANMTOKES:               /* No test needed because NMTA, NUMA and NASA are all valid. 
*/               s2valnm(pt, s, NMC, NAMECASE);               break;          case ANUMBER:               if (tokencnt>1) {tokencnt = 0; return(16);}          case ANUMBERS:               if (pcbval.action!=NUMA) {tokencnt = 0; return(17);}               s2valnm(pt, s, NU, NAMECASE);	       t = lextoke[s[*pt - 2]];	       if (t == NMS || t == NMC) {tokencnt = 0; return(17);}               break;          case ANUTOKE:               if (tokencnt>1) {tokencnt = 0; return(16);}          case ANUTOKES:               if (pcbval.action!=NUMA) {tokencnt = 0; return(17);}               s2valnm(pt, s, NMC, NAMECASE);               break;          }	  *pt -= 2;	  s += *pt;	  pt += *pt + 1;     } alldone:     *pt++ = EOS;     if (*tbuf == '\0')	  return 25;     if (atype < ATKNLIST)	  *tbuf += 2;	      /* include length and EOS */     return 0;}/*Local Variables:c-indent-level: 5c-continued-statement-offset: 5c-brace-offset: -5c-argdecl-indent: 0c-label-offset: -5comment-column: 30End:*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -