📄 htmlparser.y

📁 将HTML转换为TXT文件的程序
💻 Y
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* ------------------------------------------------------------------------- *//* * Copyright (c) 1999 *      GMRS Software GmbH, Innsbrucker Ring 159, 81669 Munich, Germany. *      http://www.gmrs.de *      All rights reserved. *      Author: Arno Unkrig (arno.unkrig@gmrs.de) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software *    must display the following acknowledgement: *      This product includes software developed by GMRS Software GmbH. * 4. The name of GMRS Software GmbH may not be used to endorse or promote *    products derived from this software without specific prior written *    permission. * * THIS SOFTWARE IS PROVIDED BY GMRS SOFTWARE GMBH ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GMRS SOFTWARE GMBH BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/
/* ------------------------------------------------------------------------- */

/*
 * bison++ parser-class configuration: the generated class is named
 * "HTMLParser", is reentrant (PURE), and is built with debug support.
 */
%name HTMLParser
%define PURE
%define DEBUG 1

%{

/* ------------------------------------------------------------------------- */

#ident "$Id: HTMLParser.y,v 1.14 1999/10/26 10:56:55 arno Exp $"

#include "html.h"
#include "HTMLParser.h"

// MIPS machines don't have "alloca()", so disable stack realloc'ing.
#ifdef mips
#define yyoverflow yyerror("parser stack overflow"), (void)
#endif

/* ------------------------------------------------------------------------- */

%}

/* ------------------------------------------------------------------------- */

/*
 * The lexer and the error handler are declared pure virtual ("= 0"), as are
 * "process()" (called once with the finished document) and "read_cdata()"
 * (called by the SCRIPT and STYLE rules); a concrete subclass must supply
 * all of them. "list_nesting" starts out at zero.
 */
%define LEX_BODY = 0
%define ERROR_BODY = 0
%define MEMBERS\
  virtual ~HTMLParser(); \
  virtual void process(const Document &) = 0;\
  virtual bool read_cdata(const char *terminal, string *) = 0;\
  int list_nesting;
%define CONSTRUCTOR_INIT : list_nesting(0)

/*
 * Semantic values. Every member is a raw pointer (or a plain int); the
 * grammar actions hand the pointers over to "auto_ptr"s or delete them
 * explicitly.
 */
%union {
  Document                   *document;
  Element                    *element;
  list<auto_ptr<Element> >   *element_list;
  PCData                     *pcdata;
  string                     *strinG;
  list<TagAttribute>         *tag_attributes;
  int                        inT;
  list<auto_ptr<TableRow> >  *table_rows;
  list<auto_ptr<TableCell> > *table_cells;
  ListItem                   *list_item;
  list<auto_ptr<ListItem> >  *list_items;
  Caption                    *caption;
  Heading                    *heading;
  list<auto_ptr<Option> >    *option_list;
  Option                     *option;
  DefinitionList             *definition_list;
  list<auto_ptr<DefinitionListItem> > *definition_list_item_list;
  TermName                   *term_name;
  TermDefinition             *term_definition;
  Preformatted               *preformatted;
  Address                    *address;
  list<auto_ptr<list<TagAttribute> > > *tag_attributes_list;
}

/* Semantic value types of the nonterminals. */
%type  <document>                 document_
%type  <pcdata>                   pcdata
%type  <pcdata>                   opt_pcdata
%type  <element_list>             body_content
%type  <heading>                  heading
%type  <heading>                  HX
%type  <inT>                      END_HX
%type  <element>                  block
%type  <element>                  block_except_p
%type  <element>                  text
%type  <element_list>             texts
%type  <element_list>             opt_texts
%type  <element>                  font
%type  <element>                  phrase
%type  <element>                  special
%type  <element>                  form
%type  <table_rows>               table_rows
%type  <table_cells>              table_cells
%type  <caption>                  caption
%type  <caption>                  opt_caption
%type  <element_list>             applet_content
%type  <definition_list>          definition_list
%type  <definition_list_item_list> definition_list_content
%type  <term_name>                term_name
%type  <term_definition>          term_definition
%type  <option_list>              select_content
%type  <option>                   option
%type  <element>                  list
%type  <list_items>               list_content
%type  <list_item>                list_item
%type  <preformatted>             preformatted
%type  <element_list>             opt_flow
%type  <element_list>             flow
%type  <element>                  flow_
%type  <element_list>             paragraph_content
%type  <address>                  address
%type  <tag_attributes_list>      map_content
%type  <tag_attributes> opt_LI
%type  <tag_attributes> opt_P

/* Tokens without a tag name of their own. */
%token                  DOCTYPE
%token <strinG>         PCDATA
%token                  SCAN_ERROR

/* Tokens that carry a "list<TagAttribute> *" as their semantic value. */
%token <tag_attributes> A
%token <tag_attributes> ADDRESS
%token <tag_attributes> APPLET
%token <tag_attributes> AREA
%token <tag_attributes> B
%token <tag_attributes> BASE
%token <tag_attributes> BASEFONT
%token <tag_attributes> BIG
%token <tag_attributes> BLOCKQUOTE
%token <tag_attributes> BODY
%token <tag_attributes> BR
%token <tag_attributes> CAPTION
%token <tag_attributes> CENTER
%token <tag_attributes> CITE
%token <tag_attributes> CODE
%token <tag_attributes> DD
%token <tag_attributes> DFN
%token <tag_attributes> DIR
%token <tag_attributes> DIV
%token <tag_attributes> DL
%token <tag_attributes> DT
%token <tag_attributes> EM
%token <tag_attributes> FONT
%token <tag_attributes> FORM
%token <tag_attributes> H1
%token <tag_attributes> H2
%token <tag_attributes> H3
%token <tag_attributes> H4
%token <tag_attributes> H5
%token <tag_attributes> H6
%token <tag_attributes> HEAD
%token <tag_attributes> HR
%token <tag_attributes> HTML
%token <tag_attributes> I
%token <tag_attributes> IMG
%token <tag_attributes> INPUT
%token <tag_attributes> ISINDEX
%token <tag_attributes> KBD
%token <tag_attributes> LI
%token <tag_attributes> LINK
%token <tag_attributes> MAP
%token <tag_attributes> MENU
%token <tag_attributes> META
%token <tag_attributes> NOBR
%token <tag_attributes> OL
%token <tag_attributes> OPTION
%token <tag_attributes> P
%token <tag_attributes> PARAM
%token <tag_attributes> PRE
%token <tag_attributes> SAMP
%token <tag_attributes> SCRIPT
%token <tag_attributes> SELECT
%token <tag_attributes> SMALL
%token <tag_attributes> STRIKE
%token <tag_attributes> STRONG
%token <tag_attributes> STYLE
%token <tag_attributes> SUB
%token <tag_attributes> SUP
%token <tag_attributes> TABLE
%token <tag_attributes> TD
%token <tag_attributes> TEXTAREA
%token <tag_attributes> TH
%token <tag_attributes> TITLE
%token <tag_attributes> TR
%token <tag_attributes> TT
%token <tag_attributes> U
%token <tag_attributes> UL
%token <tag_attributes> VAR

/* "END_..." tokens carry no semantic value. */
%token                  END_A
%token                  END_ADDRESS
%token                  END_APPLET
%token                  END_B
%token                  END_BIG
%token                  END_BLOCKQUOTE
%token                  END_BODY
%token                  END_CAPTION
%token                  END_CENTER
%token                  END_CITE
%token                  END_CODE
%token                  END_DD
%token                  END_DFN
%token                  END_DIR
%token                  END_DIV
%token                  END_DL
%token                  END_DT
%token                  END_EM
%token                  END_FONT
%token                  END_FORM
%token                  END_H1
%token                  END_H2
%token                  END_H3
%token                  END_H4
%token                  END_H5
%token                  END_H6
%token                  END_HEAD
%token                  END_HTML
%token                  END_I
%token                  END_KBD
%token                  END_LI
%token                  END_MAP
%token                  END_MENU
%token                  END_NOBR
%token                  END_OL
%token                  END_OPTION
%token                  END_P
%token                  END_PRE
%token                  END_SAMP
%token                  END_SCRIPT
%token                  END_SELECT
%token                  END_SMALL
%token                  END_STRIKE
%token                  END_STRONG
%token                  END_STYLE
%token                  END_SUB
%token                  END_SUP
%token                  END_TABLE
%token                  END_TD
%token                  END_TEXTAREA
%token                  END_TH
%token                  END_TITLE
%token                  END_TR
%token                  END_TT
%token                  END_U
%token                  END_UL
%token                  END_VAR

/* ------------------------------------------------------------------------- */

%start document

%% /* { */

/*
 * Start symbol: hand the completely parsed document to "process()", then
 * free it.
 */
document:
  document_ {
    process(*$1);
    delete $1;
  }
  ;

/*
 * Well... actually, an HTML document should look like
 *
 * <!DOCTYPE ...>
 * <HTML>
 *   <HEAD>
 *   ...
 *   </HEAD>
 *   <BODY>
 *   ...
 *   </BODY>
 * </HTML>
 *
 * but...
 *
 * (A) All seven tags are optional
 * (B) The contents of the HEAD and the BODY section can be distinguished
 * (C) Most people out there do not know which element to put before, into,
 *     or after which section...
 *
 * so... let's just forget about the structure of an HTML document, discard
 * the seven tags, and process the remainder as a series of sections.
*/

document_:
  /* empty */ {
    $$ = new Document;
    $$->body.content.reset(new list<auto_ptr<Element> >);
  }
  | document_ error {
    $$ = $1;
  }
  | document_ DOCTYPE {
    $$ = $1;
  }
  | document_ HTML {
    /*
     * Assign "$$" before dereferencing it, like every other rule here does,
     * instead of relying on the generated parser having pre-loaded "$$"
     * with "$1".
     */
    ($$ = $1)->attributes.reset($2);
  }
  | document_ END_HTML {
    $$ = $1;
  }
  | document_ HEAD {
    delete $2; /* <HEAD> attributes are discarded. */
    $$ = $1;
  }
  | document_ END_HEAD {
    $$ = $1;
  }
  | document_ TITLE opt_pcdata opt_END_TITLE {
    delete $2; // Ignore <TITLE> attributes
    ($$ = $1)->head.title.reset($3);
  }
  | document_ ISINDEX {
    ($$ = $1)->head.isindex_attributes.reset($2);
  }
  | document_ BASE {
    ($$ = $1)->head.base_attributes.reset($2);
  }
  | document_ META {
    ($$ = $1)->head.meta_attributes.reset($2);
  }
  | document_ LINK {
    ($$ = $1)->head.link_attributes.reset($2);
  }
  | document_ SCRIPT {
    /* The script text is not tokenized; it is read raw up to "</SCRIPT>". */
    auto_ptr<Script> s(new Script);
    s->attributes.reset($2);
    if (!read_cdata("</SCRIPT>", &s->text)) {
      yyerror("CDATA terminal not found");
    }
    ($$ = $1)->head.scripts.push_back(s);
  }
  | document_ STYLE {
    /* Same as SCRIPT: the style text is read raw up to "</STYLE>". */
    auto_ptr<Style> s(new Style);
    s->attributes.reset($2);
    if (!read_cdata("</STYLE>", &s->text)) {
      yyerror("CDATA terminal not found");
    }
    ($$ = $1)->head.styles.push_back(s);
  }
  | document_ BODY {
    delete $2; /* <BODY> attributes are discarded. */
    $$ = $1;
  }
  | document_ END_BODY {
    $$ = $1;
  }
  | document_ texts {
    /* Text outside of any block is wrapped in an attribute-less Paragraph. */
    Paragraph *p = new Paragraph;
    p->texts.reset($2);
    ($$ = $1)->body.content->push_back(auto_ptr<Element>(p));
  }
  | document_ heading {
    ($$ = $1)->body.content->push_back(auto_ptr<Element>($2));
  }
  | document_ block {
    ($$ = $1)->body.content->push_back(auto_ptr<Element>($2));
  }
  | document_ address {
    ($$ = $1)->body.content->push_back(auto_ptr<Element>($2));
  }
  ;

/*
 * Copy the scanner's PCDATA string into a new PCData node and free the
 * scanner's string.
 */
pcdata:
  PCDATA {
    $$ = new PCData;
    $$->text = *$1;
    delete $1;
  }
  ;

/*
 * A (possibly empty) sequence of text runs, headings, blocks and addresses.
 * Each text run is wrapped in a Paragraph of its own.
 */
body_content:
  /* empty */ {
    $$ = new list<auto_ptr<Element> >;
  }
  | body_content error {
    $$ = $1;
  }
  | body_content texts {
    Paragraph *p = new Paragraph;
    p->texts = auto_ptr<list<auto_ptr<Element> > >($2);
    ($$ = $1)->push_back(auto_ptr<Element>(p));
  }
  | body_content heading {
    ($$ = $1)->push_back(auto_ptr<Element>($2));
  }
  | body_content block {
    ($$ = $1)->push_back(auto_ptr<Element>($2));
  }
  | body_content address {
    ($$ = $1)->push_back(auto_ptr<Element>($2));
  }
  ;

/*
 * A heading. A mismatch between the opening and the closing level is
 * reported through "yyerror()", but the heading is still built from the
 * opening tag.
 */
heading:
  HX paragraph_content END_HX {
    /* EXTENSION: Allow paragraph content in heading, not only texts */
    if ($1->level != $3) {
      yyerror ("Levels of opening and closing headings don't match");
    }
    $$ = $1;
    $$->content.reset($2);
  }
  ;

/* A block is either a "block_except_p" or a paragraph introduced by P. */
block:
  block_except_p {
    $$ = $1;
  }
  | P paragraph_content opt_END_P {
    Paragraph *p = new Paragraph;
    p->attributes.reset($1);
    p->texts.reset($2);
    $$ = p;
  }
  ;

/*
 * Paragraph content: text runs are spliced into one flat list (the emptied
 * source list is then freed); blocks are appended as single elements.
 */
paragraph_content:
  /* EXTENSION: Allow blocks (except "<P>") in paragraphs. */
  /* empty */ {
    $$ = new list<auto_ptr<Element> >;
  }
  | paragraph_content error {
    $$ = $1;
  }
  | paragraph_content texts {
    $$ = $1;
    $$->splice($$->end(), *$2);
    delete $2;
  }
  | paragraph_content block_except_p {
    ($$ = $1)->push_back(auto_ptr<Element>($2));
  }
  ;

block_except_p:
  list {
    $$ = $1;
  }
  | preformatted {
    $$ = $1;
  }
  | definition_list {
    $$ = $1;
  }
  | DIV body_content END_DIV {
    Division *p = new Division;
    p->attributes.reset($1);
    p->body_content.reset($2);
    $$ = p;
  }
  | CENTER body_content opt_END_CENTER {
    Center *p = new Center;
    delete $1;       // CENTER has no attributes.
    p->body_content.reset($2);
    $$ = p;
  }
  | BLOCKQUOTE body_content END_BLOCKQUOTE {
    delete $1; // BLOCKQUOTE has no attributes!
    BlockQuote *bq = new BlockQuote;
    bq->content.reset($2);
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -