📄 sgml.c

📁 将HTML转换为TXT文件的程序
💻 C
字号:
/* ------------------------------------------------------------------------- *//* * Copyright (c) 1999 *      GMRS Software GmbH, Innsbrucker Ring 159, 81669 Munich, Germany. *      http://www.gmrs.de *      All rights reserved. *      Author: Arno Unkrig (arno.unkrig@gmrs.de) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software *    must display the following acknowledgement: *      This product includes software developed by GMRS Software GmbH. * 4. The name of GMRS Software GmbH may not be used to endorse or promote *    products derived from this software without specific prior written *    permission. * * THIS SOFTWARE IS PROVIDED BY GMRS SOFTWARE GMBH ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GMRS SOFTWARE GMBH BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. *//* ------------------------------------------------------------------------- */#ident "$Id: sgml.C,v 1.6 1999/11/08 12:21:31 arno Exp $"#include <stdlib.h>#include <string.h>#include <ctype.h>#include "html.h"#include "sgml.h"#ifndef nelems#define nelems(array) (sizeof(array) / sizeof((array)[0]))#endif/* ------------------------------------------------------------------------- *//* * Keep this array sorted alphabetically! */static const struct TextToInt {  char name[7];  int  code;} latin1_entities[] = {  { "AElig",  LATIN1_AElig   },  { "Aacute", LATIN1_Aacute  },  { "Acirc",  LATIN1_Acirc   },  { "Agrave", LATIN1_Agrave  },  { "Aring",  LATIN1_Aring   },  { "Atilde", LATIN1_Atilde  },  { "Auml",   LATIN1_Auml    },  { "Ccedil", LATIN1_Ccedil  },  { "ETH",    LATIN1_ETH     },  { "Eacute", LATIN1_Eacute  },  { "Ecirc",  LATIN1_Ecirc   },  { "Egrave", LATIN1_Egrave  },  { "Euml",   LATIN1_Euml    },  { "Iacute", LATIN1_Iacute  },  { "Icirc",  LATIN1_Icirc   },  { "Igrave", LATIN1_Igrave  },  { "Iuml",   LATIN1_Iuml    },  { "Ntilde", LATIN1_Ntilde  },  { "Oacute", LATIN1_Oacute  },  { "Ocirc",  LATIN1_Ocirc   },  { "Ograve", LATIN1_Ograve  },  { "Oslash", LATIN1_Oslash  },  { "Otilde", LATIN1_Otilde  },  { "Ouml",   LATIN1_Ouml    },  { "THORN",  LATIN1_THORN   },  { "Uacute", LATIN1_Uacute  },  { "Ucirc",  LATIN1_Ucirc   },  { "Ugrave", LATIN1_Ugrave  },  { "Uuml",   LATIN1_Uuml    },  { "Yacute", LATIN1_Yacute  },  { "aacute", LATIN1_aacute  },  { "acirc",  LATIN1_acirc   },  { "acute",  LATIN1_acute   },  { "aelig",  LATIN1_aelig   },  { "agrave", LATIN1_agrave  },  { "amp",    '&'            },  { "aring",  LATIN1_aring   },  { "atilde", LATIN1_atilde  },  { "auml",   LATIN1_auml    },  { "brvbar", LATIN1_brvbar  },  { "ccedil", LATIN1_ccedil  },  { "cedil",  LATIN1_cedil   },  { "cent",   LATIN1_cent    },  { "copy",   LATIN1_copy    },  { "curren", LATIN1_curren  },  { "deg",    LATIN1_deg     },  { "divide", LATIN1_divide  },  { "eacute", LATIN1_eacute  },  { "ecirc",  LATIN1_ecirc   },  { "egrave", LATIN1_egrave  },  { "eth",    LATIN1_eth     },  { "euml",   LATIN1_euml    },  { "frac12", LATIN1_frac12  },  { "frac14", LATIN1_frac14  },  { "frac34", LATIN1_frac34  },  { "gt",     '>'            },  { "iacute", LATIN1_iacute  },  { "icirc",  LATIN1_icirc   },  { "iexcl",  LATIN1_iexcl   },  { "igrave", LATIN1_igrave  },  { "iquest", LATIN1_iquest  },  { "iuml",   LATIN1_iuml    },  { "laquo",  LATIN1_laquo   },  { "lt",     '<'            },  { "macr",   LATIN1_macr    },  { "micro",  LATIN1_micro   },  { "middot", LATIN1_middot  },  { "nbsp",   LATIN1_nbsp    },  { "not",    LATIN1_not     },  { "ntilde", LATIN1_ntilde  },  { "oacute", LATIN1_oacute  },  { "ocirc",  LATIN1_ocirc   },  { "ograve", LATIN1_ograve  },  { "ordf",   LATIN1_ordf    },  { "ordm",   LATIN1_ordm    },  { "oslash", LATIN1_oslash  },  { "otilde", LATIN1_otilde  },  { "ouml",   LATIN1_ouml    },  { "para",   LATIN1_para    },  { "plusmn", LATIN1_plusmn  },  { "pound",  LATIN1_pound   },  { "quot",   '"'            },  { "raquo",  LATIN1_raquo   },  { "reg",    LATIN1_reg     },  { "sect",   LATIN1_sect    },  { "shy",    LATIN1_shy     },  { "sup1",   LATIN1_sup1    },  { "sup2",   LATIN1_sup2    },  { "sup3",   LATIN1_sup3    },  { "szlig",  LATIN1_szlig   },  { "thorn",  LATIN1_thorn   },  { "times",  LATIN1_times   },  { "uacute", LATIN1_uacute  },  { "ucirc",  LATIN1_ucirc   },  { "ugrave", LATIN1_ugrave  },  { "uml",    LATIN1_uml     },  { "uuml",   LATIN1_uuml    },  { "yacute", LATIN1_yacute  },  { "yen",    LATIN1_yen     },  { "yuml",   LATIN1_yuml    },};/* ------------------------------------------------------------------------- */voidreplace_sgml_entities(string *s){  string::size_type j = 0;    for (;;) {    string::size_type l = s->length();    /*     * Skip characters before ampersand.     */    while (j < l && s->at(j) != '&') ++j;    if (j >= l) break;    /*     * So we have an ampersand...     */    /*     * Don't process the last three characters; an SGML entity wouldn't fit     * in anyway!     */    if (j + 3 >= l) break;          // Watch out! Unsigned arithmetics!    string::size_type beg = j++;    // Skip the ampersand;    /*     * Look at the next character.     */    char c = s->at(j++);    if (c == '#') {      /*       * Decode entities like "&#233;".       * Some authors forget the ";", but we tolerate this.       */      c = s->at(j++);      if (isdigit(c)) {        int x = c - '0';        for (; j < l; ++j) {          c = s->at(j);          if (c == ';') { ++j; break; }          if (!isdigit(c)) break;          x = 10 * x + c - '0';        }        s->replace(beg, j - beg, 1, (char) x);        j = beg + 1;      }    } else    if (isalpha(c)) {      /*       * Decode entities like "&nbsp;".       * Some authors forget the ";", but we tolerate this.       */      char name[8];      name[0] = c;      size_t i = 1;      for (; j < l; ++j) {        c = s->at(j);        if (c == ';') { ++j; break; }        if (!isalnum(c)) break;        if (i < sizeof(name) - 1) name[i++] = c;      }      name[i] = '\0';      const TextToInt *entity = (const TextToInt *) bsearch(        name,        latin1_entities, nelems(latin1_entities), sizeof(TextToInt),        (int (*)(const void *, const void *)) strcmp      );      if (entity != NULL) {        s->replace(beg, j - beg, 1, (char) entity->code);        j = beg + 1;      }    } else {      ;                         /* EXTENSION: Allow literal '&' sometimes. */    }  }}/* ------------------------------------------------------------------------- */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -