📄 htmlcontrol.c

📁 将HTML转换为TXT文件的程序
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
    if (c == '<') {      /*       * Examine the first character of the tag.       */      c = get_char();      if (c == '!') {        c = get_char();        if (c == '-') {          c = get_char();          if (c != '-') return SCAN_ERROR;          /*	   * This is a comment... skip it!	   *	   *   <!-- Single-line comment -->	   *	   *   <!-- Multi-	   *        line	   *        comment //-->	   *	   * EXTENSION: Allow "-->" as the terminator of a multi-line comment.	   */          int state = 0;          do {            c = get_char();            if (c == EOF) return SCAN_ERROR;            switch (state) {            case 0: if (c == '-') state = 1; break;            case 1: state = c == '-' ? 2 : 0; break;            case 2: state = c == '>' ? 3 : c == '-' ? 2 : 0; break;            }          } while (state != 3);          continue;   // Start over        }        /*         * Scan "<!DOCTYPE ...>" tag.         */        if (!isalpha(c)) return SCAN_ERROR;        string tag_name(1, '!');        tag_name += c;        for (;;) {          c = get_char();          if (!isalnum(c) && c != '-') break;          tag_name += c;        }        if (cmp_nocase(tag_name, "!DOCTYPE") != 0) return SCAN_ERROR;        while (c != '>') {          c = get_char();          if (c == EOF || c == '\n') return SCAN_ERROR;        }        return DOCTYPE;      }      if (c == '/' || isalpha(c) || c == '_') {        string tag_name;        bool   is_end_tag = false;        if (c == '/') { is_end_tag = true; c = get_char(); }        if (!isalpha(c) && c != '_') return SCAN_ERROR;        tag_name += c;        for (;;) {          c = get_char();          if (!isalnum(c) && c != '-' && c != '_') break;          tag_name += c;        }        while (isspace(c)) c = get_char();        /*         * Scan tag attributes (only for opening tags). Create the         * "tag_attributes" only on demand; this saves a lot of overhead.         */        auto_ptr<list<TagAttribute> > tag_attributes;        if (!is_end_tag) {          while (isalpha(c) || c == '_') {            TagAttribute attribute;            /*             * Scan attribute name.             */            attribute.first = c;            for (;;) {              c = get_char();              if (!isalpha(c) && c != '-' && c != '_') break;              attribute.first += c;            }            while (isspace(c)) c = get_char(); // Skip WS after attribute name            /*             * Scan (optional) attribute value.             */            if (c == '=') {              c = get_char();              while (isspace(c)) c = get_char();              if (c == '"' || c == '\'') {                int closing_quote = c;   // Same as opening quote!                for (;;) {                  c = get_char();                  if (c == EOF || c == '\n') return SCAN_ERROR;                  if (c == closing_quote) break;                  /*                   * Do *not* interpret "&auml;" and consorts here! This                   * would ruin tag attributes like "HREF=hhh?a=1&b=2".                   */                  attribute.second += c;                }                c = get_char();    // Get next char after closing quote.              } else              while (c != '>' && c > ' ') {                if (c == EOF || c == '\n') return SCAN_ERROR;                attribute.second += c;                c = get_char();              }              while (isspace(c)) c = get_char();   // Skip WS after attr value            }            /*             * Store the attribute.             */            if (!tag_attributes.get()) {              tag_attributes.reset(new list<TagAttribute>);            }            tag_attributes->push_back(attribute);          }        }        if (c != '>') return SCAN_ERROR;        if (debug_scanner) {          cerr << "Scanned tag \"<" << (is_end_tag ? "/" : "") << tag_name;          if (!is_end_tag && tag_attributes.get()) {	    const list<TagAttribute>           &ta(*tag_attributes);            list<TagAttribute>::const_iterator j;            for (j = ta.begin(); j != ta.end(); ++j) {	      cerr << " " << (*j).first << "=\"" << (*j).second << "\"";            }          }          cerr << ">\"" << endl;        }        /*         * Look up the tag in the table of recognized tags.         */	static int (*const f)(const char *, const char *) = cmp_nocase;        const TextToIntP *tag = (const TextToIntP *) bsearch(          tag_name.c_str(),          tag_names, nelems(tag_names), sizeof(TextToIntP),          (int (*)(const void *, const void *)) f        );        if (tag == NULL) { /* EXTENSION: Swallow unknown tags. */          if (debug_scanner) {            cerr << "Tag unknown -- swallowed." << endl;          }          continue;        }        /*         * Return the BISON token code for the tag.         */        if (is_end_tag) {          if (!tag->end_tag_code) {            if (debug_scanner) {              cerr << "Non-container end tag scanned." << endl;            }            continue;          }          *tag_type_return = tag->block_tag ? BLOCK_END_TAG : END_TAG;          return *tag->end_tag_code;        } else {          *tag_type_return = (            !tag->end_tag_code ? NON_CONTAINER_TAG :            tag->block_tag     ? BLOCK_START_TAG   : START_TAG          );          value_return->tag_attributes = tag_attributes.release();          return *tag->start_tag_code;        }      }      /*       * EXTENSION: This tag did not match "<!", and not "</", and not       * "<[A-Za-z-]", so take it as literal text.       */      unget_char(c);      c = '<';    }    if (c == '\n' || c >= ' ') {      string *s = value_return->strinG = new string;      while (c != EOF) {        /*         * Accept literal '<' in some cases.         */        if (c == '<') {          int c2;          unget_char(c2 = get_char());          if (c2 == '!' || c2 == '/' || isalpha(c2)) { unget_char(c); break; }        }        *s += c;        c = get_char();      }      replace_sgml_entities(s);  // Replace "&auml;" and consorts.      /*       * Swallow empty PCDATAs.       */      if (s->empty()) { delete s; continue; }      if (debug_scanner) cerr << "Scanned PCDATA \"" << *s << "\"" << endl;      return PCDATA;    }        return SCAN_ERROR;  }}/* ------------------------------------------------------------------------- */boolHTMLControl::read_cdata(const char *terminal, string *value_return){  string &s(*value_return);  int    c;  int    state = 0;  for (;;) {    c = get_char();    if (c == EOF) return false;    if (toupper(c) == terminal[state]) {      state++;      if (terminal[state] == '\0') {        s.erase(s.length() - state);        return true;      }    } else {      state = 0;    }    s += c;  }}/* ------------------------------------------------------------------------- */intHTMLControl::get_char(){  if (number_of_ungotten_chars > 0) {    return ungotten_chars[--number_of_ungotten_chars];  }  int c = is.get();  while (c == '\r') c = is.get();  if (c == EOF) {    ;  } else  if (c == '\n') {    current_line++;    current_column = 0;  } else {    current_column++;  }  return c;}/* ------------------------------------------------------------------------- */voidHTMLControl::unget_char(int c){  if (number_of_ungotten_chars == nelems(ungotten_chars)) {    yyerror("Too many chars ungotten");    return;  }  ungotten_chars[number_of_ungotten_chars++] = c;}/* ------------------------------------------------------------------------- */
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -