📄 htmlcontrol.c

📁 将HTML转换为TXT文件的程序
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* ------------------------------------------------------------------------- *//* * Copyright (c) 1999 *      GMRS Software GmbH, Innsbrucker Ring 159, 81669 Munich, Germany. *      http://www.gmrs.de *      All rights reserved. *      Author: Arno Unkrig (arno.unkrig@gmrs.de) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software *    must display the following acknowledgement: *      This product includes software developed by GMRS Software GmbH. * 4. The name of GMRS Software GmbH may not be used to endorse or promote *    products derived from this software without specific prior written *    permission. * * THIS SOFTWARE IS PROVIDED BY GMRS SOFTWARE GMBH ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GMRS SOFTWARE GMBH BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. *//* ------------------------------------------------------------------------- */#ident "$Id: HTMLControl.C,v 1.15 1999/12/08 20:36:12 arno Exp $"#include <iostream.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include "html.h"#include "HTMLControl.h"#include "sgml.h"#include "cmp_nocase.h"#ifndef nelems#define nelems(array) (sizeof(array) / sizeof((array)[0]))#endifenum {  NOT_A_TAG,  START_TAG, BLOCK_START_TAG,  END_TAG,   BLOCK_END_TAG,  NON_CONTAINER_TAG};/* ------------------------------------------------------------------------- *//* * Effectively, this method simply invokes "yylex2()", but it does some * postprocessing on PCDATA tokens that would be difficult to do in "yylex2()". */intHTMLControl::yylex(yy_HTMLParser_stype *value_return){  for (;;) { // Notice the "return" at the end of the body!    int token, tag_type;    if (next_token == EOF) {      token = yylex2(value_return, &tag_type);    } else {      token         = next_token;      *value_return = next_token_value;      tag_type      = next_token_tag_type;      next_token = EOF;    }    /*     * Switch on/off "literal mode" on "<PRE>" and "</PRE>".     */    if (token == PRE) {      literal_mode = true;      /*       * Swallow '\n' immediately following "<PRE>".       */      if (next_token == EOF) {        next_token = yylex2(&next_token_value, &next_token_tag_type);      }      if (next_token == PCDATA) {        string &s(*next_token_value.strinG);        if (!s.empty() && s[0] == '\n') s.erase(0, 1);      }    }    if (token == END_PRE) literal_mode = false;    if (token == PCDATA) {      /*       * In order to post-process the PCDATA token, we need to look ahead one       * token...       */      if (next_token == EOF) {        next_token = yylex2(&next_token_value, &next_token_tag_type);      }      /*       * Erase " '\n' { ' ' } " immediately before "</PRE>".       */      if (next_token == END_PRE) {        string &s(*value_return->strinG);        string::size_type x = s.length();        while (x > 0 && s[x - 1] == ' ') --x;        if (x > 0 && s[x - 1] == '\n') s.erase(x - 1, string::npos);      } else      /*       * Erase whitespace before end tag or block start tag.       */      if (!literal_mode && (        next_token_tag_type == END_TAG ||        next_token_tag_type == BLOCK_END_TAG ||        next_token_tag_type == BLOCK_START_TAG      )) {        string &s(*value_return->strinG);        string::size_type x = s.length();        while (x > 0 && isspace(s[x - 1])) --x;        s.erase(x, string::npos);      }      /*       * Collate sequences of whitespace, if not in "literal mode".       */      if (!literal_mode) {        string &s(*value_return->strinG);//      bool   whitespace_only = true;        for (string::size_type x = 0; x < s.length(); ++x) {          if (isspace(s[x])) {            string::size_type y;            for (y = x + 1; y < s.length() && isspace(s[y]); ++y);            s.replace(x, y - x, " ");          } else {//          whitespace_only = false;          }        }        if (s.empty()) { delete value_return->strinG; continue; }      }    }    /*     * Erase whitespace after start tag or block end tag, if not in "literal     * mode".     */    if (!literal_mode && (      (        tag_type == START_TAG ||        tag_type == BLOCK_START_TAG ||        tag_type == BLOCK_END_TAG ||        token == BR ||        token == HR      ) &&      token != SCRIPT && token != STYLE    )) {      if (next_token == EOF) {        next_token = yylex2(&next_token_value, &next_token_tag_type);      }      if (next_token == PCDATA) {        string &s(*next_token_value.strinG);        string::size_type x;        for (x = 0; x < s.length() && isspace(s[x]); ++x);        if (x > 0) s.erase(0, x);        if (s.empty()) {          delete next_token_value.strinG;          next_token = EOF;        }      }    }    return token;  }}/* ------------------------------------------------------------------------- *//* * Keep this array sorted alphabetically! */static const struct TextToIntP {  char      name[11];  char      block_tag;  const int *start_tag_code;  const int *end_tag_code;} tag_names[] = {#define pack1(tag) { #tag, 0, &HTMLParser::tag, 0 }#define pack2(tag) { #tag, 0, &HTMLParser::tag, &HTMLParser::END_##tag }#define pack3(tag) { #tag, 1, &HTMLParser::tag, &HTMLParser::END_##tag }  pack2(A),  pack3(ADDRESS),  pack2(APPLET),  pack1(AREA),  pack2(B),  pack1(BASE),  pack1(BASEFONT),  pack2(BIG),  pack3(BLOCKQUOTE),  pack3(BODY),  pack1(BR),  pack3(CAPTION),  pack3(CENTER),  pack3(CITE),  pack2(CODE),  pack3(DD),  pack2(DFN),  pack3(DIR),  pack3(DIV),  pack3(DL),  pack3(DT),  pack2(EM),  pack2(FONT),  pack3(FORM),  pack3(H1),  pack3(H2),  pack3(H3),  pack3(H4),  pack3(H5),  pack3(H6),  pack3(HEAD),  pack1(HR),  pack3(HTML),  pack2(I),  pack1(IMG),  pack1(INPUT),  pack1(ISINDEX),  pack2(KBD),  pack3(LI),  pack1(LINK),  pack2(MAP),  pack3(MENU),  pack1(META),  pack2(NOBR),  pack3(OL),  pack3(OPTION),  pack3(P),  pack1(PARAM),  pack3(PRE),  pack2(SAMP),  pack3(SCRIPT),  pack2(SELECT),  pack2(SMALL),  pack2(STRIKE),  pack2(STRONG),  pack3(STYLE),  pack2(SUB),  pack2(SUP),  pack3(TABLE),  pack3(TD),  pack2(TEXTAREA),  pack3(TH),  pack3(TITLE),  pack3(TR),  pack2(TT),  pack2(U),  pack3(UL),  pack2(VAR),#undef pack};/* ------------------------------------------------------------------------- */intHTMLControl::yylex2(yy_HTMLParser_stype *value_return, int *tag_type_return){  int c;  *tag_type_return = NOT_A_TAG;  for (;;) {   // Notice the "return" at the end of this loop.    /*     * Get the first character of the token.     */    c = get_char();    if (c == EOF) return EOF;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -