📄 file.cc

📁 larbin是一种开源的网络爬虫/网络蜘蛛
💻 CC
📖 第 1 页 / 共 2 页
字号:
上一页 12
        if (state == HEADERS)          tmp = parseHeader();        else tmp = parseHeader30X();        if (tmp) {          return 1;        }        area = ++posParse;      } else {        return 0;      }      break;    case SPECIFIC:      return pipeSpec();    default:      return 0;    }  }  return 0;}/** parse the answer code line */int html::parseCmdline () {  if (posParse - buffer >= 12) {    switch (buffer[9]) {    case '2':      state = HEADERS;      break;    case '3':      state = HEADERS30X;      break;    default:      errno = err40X;      return 1;    }  } else {    errno = earlyStop;    return 1;  }  return 0;}/** parse a line of header * @return 0 if OK, 1 if we don't want to read the file */int html::parseHeader () {  if (posParse - area < 2) {	// end of http headers#ifndef FOLLOW_LINKS    state = SPECIFIC;#elif defined(SPECIFICSEARCH)    if (isInteresting) {      state = SPECIFIC;    } else {      state = HTML;    }#else // not a SPECIFICSEARCH    state = HTML;#endif // SPECIFICSEARCH    contentStart = posParse + 1;    *(posParse-1) = 0;    _newSpec();  } else {    *posParse = 0;    here->addCookie(area);    *posParse = '\n';    if (verifType ()) return 1;    if (verifLength()) return 1;  }  return 0;}/** function called by parseHeader * parse content-type * return 1 (and set errno) if bad type, 0 otherwise * can toggle isInteresting */#define errorType() errno=badType; return 1#ifdef ANYTYPE#define checkType() return 0#elif defined(IMAGES)#define checkType() if (startWithIgnoreCase("image", area+14)) { \    return 0; \  } else { errorType (); }#else#define checkType() errorType()#endifint html::verifType () {  if (startWithIgnoreCase("content-type: ", area)) {    // Let's read the type of this doc    if (!startWithIgnoreCase("text/html", area+14)) {#ifdef SPECIFICSEARCH      if (matchContentType(area+14)) {        interestingSeen();        isInteresting = true;      } else {        checkType();      }#else // SPECIFICSEARCH      checkType();#endif // SPECIFICSEARCH    }  }  return 0;}/** function called by parseHeader * parse content-length * return 1 (and set errno) if too long file, 0 otherwise */int html::verifLength () {#ifndef SPECIFICSEARCH  if (startWithIgnoreCase("content-length: ", area)) {    int len = 0;    char *p = area+16;    while (*p >= '0' && *p <= '9') {      len = len*10 + *p -'0';      p++;    }    if (len > maxPageSize) {      errno = tooBig;      return 1;    }  }#endif // SPECIFICSEARCH  return 0;}/** parse a line of header (ans 30X) => just look for location * @return 0 if OK, 1 if we don't want to read the file */int html::parseHeader30X () {  if (posParse - area < 2) {	// end of http headers without location => err40X    errno = err40X;    return 1;  } else {	if (startWithIgnoreCase("location: ", area)) {      int i=10;      while (area[i]!=' ' && area[i]!='\n' && area[i]!='\r'             && notCgiChar(area[i])) {        i++;      }      if (notCgiChar(area[i])) {        area[i] = 0; // end of url        // read the location (do not decrease depth)        url *nouv = new url(area+10, here->getDepth(), base);#ifdef URL_TAGS        nouv->tag = here->tag;#endif // URL_TAGS        manageUrl(nouv, true);        // we do not need more headers      }      errno = err30X;      return 1;	}  }  return 0;}/*********************************************//* This part manages the content of the file *//*********************************************//** file download is complete, parse the file (headers already done) * return 0 usually, 1 if there was an error */int html::endInput () {  if (state <= HEADERS) {    errno = earlyStop;    return 1;  }  if (state == HEADERS30X) {    errno = err40X;    return 1;  }#ifdef NO_DUP  if (!global::hDuplicate->testSet(posParse)) {    errno = duplicate;    return 1;  }#endif // NO_DUP  buffer[pos] = 0;  _endOfInput();  // now parse the html  parseHtml();  return 0;}/* parse an html page */void html::parseHtml () {  while ((posParse=strchr(posParse, '<')) != NULL) {    if (posParse[1] == '!') {      if (posParse[2] == '-' && posParse[3] == '-') {        posParse += 4;        parseComment();      } else {        // nothing...        posParse += 2;      }    } else {      posParse++;      parseTag();    }  }}/* skip a comment */void html::parseComment() {  while ((posParse=strchr(posParse, '-')) != NULL) {    if (posParse[1] == '-' && posParse[2] == '>') {      posParse += 3;      return;    } else {      posParse++;    }  }  posParse = buffer+pos;}/* macros used by the following functions */#define skipSpace() \  while (*posParse == ' ' || *posParse == '\n' \         || *posParse == '\r' || *posParse == '\t') { \    posParse++; \  }#define skipText() \  while (*posParse != ' ' && *posParse != '\n' && *posParse != '>' \         && *posParse != '\r' && *posParse != '\t' && *posParse != 0) { \    posParse++; \  }#define nextWord() skipText(); skipSpace()#define thisCharIs(i, c) (c == (posParse[i]|32))#define isTag(t, p, a, i) if (t) { \      param = p; \      action = a; \      posParse += i; \    } else { \      posParse++; \      return; \    }/** Try to understand this tag */void html::parseTag () {  skipSpace();  char *param=NULL; // what parameter are we looking for  int action=-1;  // read the name of the tag  if (thisCharIs(0, 'a')) { // a href    param = "href";    action = LINK;    posParse++;  } else if (thisCharIs(0, 'l')) {    isTag(thisCharIs(1, 'i') && thisCharIs(2, 'n') && thisCharIs(3, 'k'),          "href", LINK, 4);  } else if (thisCharIs(0, 'b')) { // base href    isTag(thisCharIs(1, 'a') && thisCharIs(2, 's') && thisCharIs(3, 'e'),          "href", BASE, 4);  } else if (thisCharIs(0, 'f')) { // frame src    isTag(thisCharIs(1, 'r') && thisCharIs(2, 'a')          && thisCharIs(3, 'm') && thisCharIs(4, 'e'),          "src", LINK, 5);#ifdef IMAGES  } else if (thisCharIs(0, 'i')) { // img src    isTag(thisCharIs(1, 'm') && thisCharIs(2, 'g'), "src", LINK, 3);#endif // IMAGES  } else {    return;  }  // now find the parameter  assert(param != NULL);  skipSpace();  for (;;) {    int i=0;    while (param[i]!=0 && thisCharIs(i, param[i])) i++;    posParse += i;    if (posParse[i]=='>' || posParse[i]==0) return;    if (param[i]==0) {      parseContent(action);      return;    } else {      // not the good parameter      nextWord();    }  }}/** read the content of an interesting tag */void html::parseContent (int action) {  posParse++;  while (*posParse==' ' || *posParse=='=') posParse++;  if (*posParse=='\"' || *posParse=='\'') posParse++;  area = posParse;  char *endItem = area + maxUrlSize;  if (endItem > buffer + pos) endItem = buffer + pos;  while (posParse < endItem && *posParse!='\"' && *posParse!='\''         && *posParse!='\n' && *posParse!=' ' && *posParse!='>'         && *posParse!='\r' && *posParse!='\t' && notCgiChar(*posParse)) {    if (*posParse == '\\') *posParse = '/';    // Bye Bye DOS !    posParse++;  }  if (posParse == buffer + pos) {    // end of file => content may be truncated => forget it    return;  } else if (posParse < endItem && notCgiChar(*posParse)) {    // compute this url (not too long and not cgi)    char oldchar = *posParse;    *posParse = 0;    switch (action) {    case LINK:      // try to understand this new link      manageUrl(new url(area, here->getDepth()-1, base), false);      break;    case BASE:      // This page has a BASE HREF tag      {        uint end = posParse - area - 1;        while (end > 7 && area[end] != '/') end--; // 7 because http://        if (end > 7) { // this base looks good          end++;          char tmp = area[end];          area[end] = 0;          url *tmpbase = new url(area, 0, (url *) NULL);          area[end] = tmp;          delete base;          if (tmpbase->isValid()) {            base = tmpbase;          } else {            delete tmpbase;            base = NULL;          }        }      }      break;    default: assert(false);    }    *posParse = oldchar;  }  posParse++;}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -