⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pars2.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 4 页
字号:
}/* PARSNGRP: Parse notation names, get their dcncbs, and form an array of             pointers to them.  The array is terminated by a NULL pointer.             The number of pointers (including the NULL) is returned.             The grp buffer must have room for GRPCNT+1 members.*/UNS parsngrp(grp, pcb, tbuf)struct dcncb *grp[];          /* Buffer for building the group. */struct parse  *pcb;           /* Current parse control block. */UNCH *tbuf;{     int grpcnt = 0;          /* Number of members in the group. */     int i;     int essv = es;           /* Entity stack level when grp started. */     while (parse(pcb)!=GRPE && grpcnt<GRPCNT) {          switch (pcb->action) {          case NAS_:          /* Member name: get its control block. */               grp[grpcnt] = dcndef(parsenm(tbuf, NAMECASE));	       for (i = 0; i < grpcnt; i++)		    if (grp[i] == grp[grpcnt]) {			 mderr(98, ntoa(grpcnt + 1), grp[grpcnt]->ename + 1);			 break;		    }	       if (i == grpcnt)		    grpcnt++;               continue;          case EE_:           /* Entity ended (correctly or incorrectly). */               if (es<essv) {synerr(37, pcb); essv = es;}               continue;          case PIE_:          /* PI entity reference (invalid). */               entpisw = 0;   /* Reset PI entity indicator. */               synerr(59, pcb);               continue;          default:               break;          }          break;     }     grp[grpcnt++] = 0;       /* NULL pointer indicates end of group. */     if (es!=essv) synerr(37, pcb);     return grpcnt;           /* Return number of ptrs in group. */}/* COPYGRP: Allocate storage for a group and copy the group into it.*/PETD *copygrp(pg, grpsz)PETD pg[];                    /* Pointer to a group (array of etd ptrs). */UNS grpsz;                    /* Number of ptrs in grp, including final NULL. */{     UNS glen;                /* Group length in characters. */     PETD *gnm;               /* Ptr to permanent name group. 
*/     if (pg==0) return (PETD *)0;     glen = grpsz * sizeof(struct etd *);     memcpy( (UNIV)(gnm = (struct etd **)rmalloc(glen)) , (UNIV)pg, glen );     return gnm;}/* INGRP: Locate an etd in a name group and return its index+1 (or zero          if not found).*/int ingrp(pg, ketd)PETD pg[];                    /* Array of pointers to etds. */PETD ketd;                    /* Pointer to etd to be found in group. */{     int i = 0;               /* Array index. */     while (pg[i]) if (pg[i++]==ketd) return i;     return 0;}/* PARSELIT: Parse a delimited string and collect it into a token.             Caller supplies buffer, which must be 1 longer than             maximum string allowed.             Caller also supplies character that delimits the string.             TODO: Return 1 if CDATA, SDATA or NONSGML occurred.*/#ifdef USE_PROTOTYPESVOID parselit(UNCH *tbuf, struct parse *pcb, UNS maxlen, UNCH del)#elseVOID parselit(tbuf, pcb, maxlen, del)UNCH *tbuf;                   /* Work area for tokenization (parmlen+1). */struct parse *pcb;            /* Current parse control block. */UNS maxlen;                   /* Maximum length of token. */UNCH del;                     /* Literal delimiter: LIT LITA PIC EOS */#endif{     UNCH *pt = tbuf;         /* Current pointer into tbuf. */     UNCH lexsv = pcb->plex[del];/* Saved value of delimiter in lexical table. */     int essv = es;           /* Entity stack level when literal started. */     UNCH datadel;            /* Delimiter for CDATA/SDATA entity. */     int parmlen = (int)maxlen + 1;  /* Working limit (to be decremented). */     int overflow = 0;	      /* Did the buffer overflow? */     pcb->plex[del] = pcb->plex == lexlms ? lex.l.litc : lex.l.minlitc;     /* The RPR_ action may cause the length of the literal to decrease by	1 (this discards a final space in a minimum literal); so while	building the literal, the length must be allowed to grow to	maxlen + 1. 
*/     do {          switch (parse(pcb)) {               case LP2_:          /* Move 2nd char back to buffer; redo prev.*/                    REPEATCC;               case LPR_:          /* Move previous char to buffer; REPEATCC; */                    REPEATCC;               case MLA_:          /* Move character to buffer. */		    if (parmlen <= 0) { overflow = 1; break; }                    *pt++ = *FPOS; --parmlen;                    continue;               case FUN_:          /* Function char found; replace with space.*/		    if (parmlen <= 0) { overflow = 1; break; }                    *pt++ = ' '; --parmlen;                    continue;               case RSM_:          /* Record start: ccnt=0; ++rcnt.*/		    ++RCNT; CTRSET(RSCC); 		    if (parmlen <= 0) { overflow = 1; break; }                    *pt++ = *FPOS; --parmlen;                    continue;               case ERX_:          /* Entity reference: cancel LITC delim. */               case PEX_:          /* Parameter entity ref: cancel LITC delim.*/                    lexlms[del] = lexsv;                    continue;               case EE_:                    if (es<essv) {                         synerr(37, pcb);                         essv = es;                    }                    /* If back at top level, re-enable the LITC delimiter. */                    if (es==essv) lexlms[del] = lex.l.litc;                    continue;               case MLE_:          /* Char not allowed in minimum literal. */                    synerr(63, pcb);                    continue;               case DEF_:          /* Data entity: add it to buffer. 
*/		    if (pcb == &pcblitt) {			 int parmlensv = parmlen;			 entdatsw = 0;			 parmlen = tokdata(pt, parmlen);			 if (parmlen < 0)			      break;			 pt += parmlensv - parmlen;			 continue;		    }		    if (parmlen < datalen + 2) {			 entdatsw = 0;			 overflow = 1;			 break;		    }		    parmlen -= datalen + 2;                    *pt++ = datadel =                         BITON(entdatsw, CDECONT) ? DELCDATA : DELSDATA;                    entdatsw = 0;                    memcpy( pt , data, datalen );                    pt += datalen;                    *pt++ = datadel;                    continue;               case NON_:          /* Non-SGML char (delimited and shifted). */		    if (parmlen < 2) { overflow = 1; break; }		    parmlen -= 2;                    memcpy( pt , nonchbuf, 2 );                    pt += 2;                    continue;               case RPR_:          /* Remove character from buffer. */                    --pt; ++parmlen;                    break;               case EOD_:                    exiterr(92, pcb);               default:                    break;          }          break;     } while (!overflow && pcb->action!=TER_);     if (parmlen <= 0) {	  --pt;	  overflow = 1;     }     if (overflow)	  sgmlerr(134, pcb, ntoa((int)maxlen),(UNCH *)0);     datalen = (UNS)(pt-tbuf);/* To return PI string to text processor. */     *pt++ = EOS;     pcb->plex[del] = lexsv;     /* Restore normal delimiter handling. */     if (es!=essv) synerr(37, pcb);}/* Handle a data entity in a tokenized attribute value literal.Parmlen is amount of space left.  Return new parmlen. If there's notenough space return -1, and copy up to parmlen + 1 characters.  Onlytokenization should be done, not attribute value interpretation. 
*/int tokdata(pt, parmlen)UNCH *pt;int parmlen;{     int skip = (pcblitt.newstate == 0);     int i;          for (i = 0; parmlen >= 0 && i < datalen; i++) {	  switch (data[i]) {	  case SPCCHAR:	       if (!skip) {		    *pt++ = data[i];		    parmlen--;		    skip = 1;	       }	       break;	  default:	       if (data[i] == DELNONCH) {		    assert(i + 1 < datalen);		    if ((parmlen -= 2) < 0)			 break;		    *pt++ = DELNONCH;		    *pt++ = data[++i];		    skip = 0;	       }	       else {		    *pt++ = data[i];		    parmlen--;		    skip = 0;	       }	       break;	  }     }     pcblitt.newstate = skip ? 0 : pcblittda;     return parmlen;}/* PARSEMD: Parser for markup declarations.            It returns a token each time it is called.*/int parsemd(pt, namecase, lpcb, tokenlen)UNCH *pt;                     /* Token buffer: >=tokenlen+2. */int namecase;                 /* Case translation: ENTCASE NAMECASE AVALCASE. */struct parse *lpcb;           /* Parse control block for literal parse. */UNS tokenlen;                 /* Max length of expected token: NAMELEN LITLEN */{     struct parse *pcb;       /* Current parse control block. */     pcb = (lpcb) ? &pcbmd : &pcbmdc;  /* If no literal pcb, dcl is comment. */     doparse: while (parse(pcb)==EE_)          if (es<mdessv) {synerr(37, pcb); mdessv = es;}     if (pcb->action==PIE_) { /* PI entity reference not allowed. */          entpisw = 0;        /* Reset PI entity indicator. */          synerr(59, pcb);          goto doparse;     }     ++parmno;           /* Increment parameter counter. */     switch (pcb->action) {     case CDR:           /* COM[1] (MINUS) occurred previously. */          REPEATCC;          return (int)pcb->action;     case LIT:           /* Literal: CDATA with LIT delimiter. */          parselit(pt, lpcb, tokenlen, lex.d.lit);          return (int)pcb->action;     case LITE:          /* Literal: CDATA with LITA delimiter. 
*/          parselit(pt, lpcb, tokenlen, lex.d.lita);          return((int)(pcb->action = LIT));     case RNS:           /* Reserved name started (after RNI). */          parsenm(pt, NAMECASE);          return (int)pcb->action;     case NAS:           /* Name started. */          if (namecase!=AVALCASE) {               parsenm(pt, namecase);               return (int)pcb->action;          }          /* Treat attribute value as name character string. */     case NMT:           /* Name token string. */          parsetkn(pt, NMC, (int)tokenlen);  /* Get undelimited value. */          return (int)pcb->action;     case NUM:           /* Number or number token string. */          parsetkn(pt, (UNCH)((int)tokenlen<=NAMELEN ? NU:NMC), (int)tokenlen);	  if (tokenlen > NAMELEN) pcb->newstate = 0;          return (int)pcb->action;     case PENR:	  REPEATCC;	  return (pcb->action = PEN);     case EOD_:          exiterr(133, pcb);          /* EXIT */     default:            /* End of declaration. */          return (int)pcb->action; /* EMD GRPS MGRP PEN PGRP */     }}/* PARSEMOD: If the declared content was a keyword, the token count is zero             and it is only necessary to save the type.  Otherwise,             collect the outermost token count and model type bytes for a model.             The count includes tokens found in nested groups also.             After building the model, parse for its occurrence indicator.*/struct thdr *parsemod(dctype)int dctype;                        /* Content type (0=model). */{     gbuf[0].ttype = (UNCH)dctype; /* Initialize content flags byte. */     if (dctype) {gbuf[0].tu.tnum = 0; return gbuf;} /* Return if not model. */     gbuf[0].tu.tnum = 0;          /* Don't count 1st group or model header. */     gbuf[1].ttype = 0;            /* Initialize 1st group type ... */     gbuf[1].tu.tnum = 0;          /* and count. */     grplvl = 1;                   /* Content model is 1st level group. 
*/     pcbgrcm.newstate = 0;         /* Go parse the model group. */     /* Empty group is trapped during syntax parse; other errors return NULL. */     if (!parsegcm(&pcbgrcm, &gbuf[1], &gbuf[0])) return (struct thdr *)0;     parse(&pcbgrcs);             /* Get the model suffix, if there is one. */     switch(pcbgrcs.action) {     case OPT:                     /* OPT occurrence indicator for model. */          SET(gbuf[1].ttype, TOPT|TXOPT);          break;     case REP:                     /* REP occurrence indicator for model. */          SET(gbuf[1].ttype, TREP|TXREP);          break;     case OREP:                    /* OREP occurrence indicator for model. */          SET(gbuf[1].ttype, TOREP|TXOREP);          break;     case EE_:	  if (es < mdessv) {	       synerr(37, &pcbmd);	       mdessv = es;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -