⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 html2text.c

📁 将HTML转换为TXT文件的程序
💻 C
字号:
/* ------------------------------------------------------------------------- *//* * Copyright (c) 1999 *      GMRS Software GmbH, Innsbrucker Ring 159, 81669 Munich, Germany. *      http://www.gmrs.de *      All rights reserved. *      Author: Arno Unkrig (arno.unkrig@gmrs.de) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in the *    documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software *    must display the following acknowledgement: *      This product includes software developed by GMRS Software GmbH. * 4. The name of GMRS Software GmbH may not be used to endorse or promote *    products derived from this software without specific prior written *    permission. * * THIS SOFTWARE IS PROVIDED BY GMRS SOFTWARE GMBH ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GMRS SOFTWARE GMBH BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. *//* ------------------------------------------------------------------------- */#ident "$Id: html2text.C,v 1.15 1999/12/07 18:29:44 arno Exp $"#include <iostream.h>#include <string.h>#include <stdlib.h>#include "html.h"#include "HTMLControl.h"#include "urlistream.h"#include "format.h"#define stringify(x) stringify2(x)#define stringify2(x) #x/* ------------------------------------------------------------------------- */class MyParser : public HTMLControl {public:  enum { PRINT_AS_ASCII, UNPARSE, SYNTAX_CHECK };  MyParser(    istream    &is_,    bool       debug_scanner_,    bool       debug_parser_,    ostream    &os_,    int        mode_,    int        width_,    const char *file_name_  ) :    HTMLControl(is_, debug_scanner_, debug_parser_),    os(os_),    mode(mode_),    width(width_),    file_name(file_name_)  {}private:  /*virtual*/ void yyerror(char *);  /*virtual*/ void process(const Document &);  ostream &os;  int     mode;  int     width;  string  file_name;};/*virtual*/ voidMyParser::yyerror(char *p){  /*   * Swallow parse error messages if not in "syntax check" mode.   */  if (mode != SYNTAX_CHECK && !strcmp(p, "parse error")) return;  cerr    << "File \""    << file_name    << "\", line "    << current_line    << ", column "    << current_column    << ": "    << p    << endl;}/*virtual*/ voidMyParser::process(const Document &document){  switch (mode) {  case PRINT_AS_ASCII:    document.format(/*indent_left*/ 0, width, Area::LEFT, os);    break;  case UNPARSE:    document.unparse(os, endl);    break;  case SYNTAX_CHECK:    break;  default:    cerr << "??? Invalid mode " << mode << " ??? " << endl;    exit(1);    break;  }}/* ------------------------------------------------------------------------- */static const char *usage = "\Usage:\n\  html2text -help\n\  html2text -version\n\  html2text [ -unparse | -check ] [ -debug-scanner ] [ -debug-parser ] \\\n\     [ -rcfile <file> ] [ -style ( compact | pretty ) ] [ -width <w> ] \\\n\     [ -o <file> ] [ -nobs ] [ <input-url> ] ...\n\Formats HTML document(s) read from <input-url> or STDIN and generates ASCII\n\text.\n\  -help          Print this text\n\  -version       Print the program version\n\  -unparse       Generate HTML instead of ASCII output\n\  -check         Do syntax checking only\n\  -debug-scanner Report parsed tokens on STDERR (debugging)\n\  -debug-parser  Report parser activity on STDERR (debugging)\n\  -rcfile <file> Read \"<file>\" instead of \"$HOME/.html2textrc\"\n\  -style compact Create a very \"compact\" output format (default)\n\  -style pretty  Insert some vertical space for nicer output\n\  -width <w>     Optimize for screen widths other than 79\n\  -o <file>      Redirect output into <file>\n\  -nobs          Do not use backspaces for boldface and underlining\n\";intmain(int argc, char **argv){  if (argc == 2 && !strcmp(argv[1], "-help")) {    cout      << "This is HTML2TEXT, version " stringify(VERSION) << endl      << "Check out http://www.gmrs.de for the latest version." << endl      << endl      << usage;    exit(0);  }  if (argc == 2 && !strcmp(argv[1], "-version")) {    cout << stringify(VERSION) << endl;    exit(0);  }  bool       mode              = MyParser::PRINT_AS_ASCII;  bool       debug_scanner     = false;  bool       debug_parser      = false;  const char *home             = getenv("HOME");  string     rcfile            = string(home ? home : "") + "/.html2textrc";  const char *style            = "compact";  int        width             = 79;  const char *output_file_name = "-";  bool       use_backspaces    = true;  int i;  for (i = 1; i < argc && argv[i][0] == '-' && argv[i][1]; i++) {    const char *arg = argv[i];    if (!strcmp(arg, "-unparse"      )) { mode = MyParser::UNPARSE;      } else    if (!strcmp(arg, "-check"        )) { mode = MyParser::SYNTAX_CHECK; } else    if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true;          } else    if (!strcmp(arg, "-debug-parser" )) { debug_parser = true;           } else    if (!strcmp(arg, "-rcfile"       )) { rcfile = argv[++i];            } else    if (!strcmp(arg, "-style"        )) { style = argv[++i];             } else    if (!strcmp(arg, "-width"        )) { width = atoi(argv[++i]);       } else    if (!strcmp(arg, "-o"            )) { output_file_name = argv[++i];  } else    if (!strcmp(arg, "-nobs"         )) { use_backspaces = false;        } else    {      cerr	<< "Unrecognized command line option \""	<< arg	<< "\", try \"-help\"."	<< endl;      exit(1);    }  }  if (i > argc) {    cerr      << "Error: Required parameter after \""      << argv[argc - 1]      << "\" missing."      << endl;    exit(1);  }  const char *const *input_urls;  int        number_of_input_urls;  if (i >= argc) {    static const char *const x = "-";    input_urls = &x;    number_of_input_urls = 1;  } else {    input_urls = argv + i;    number_of_input_urls = argc - i;  }  /*   * Set up formatting: First, set some formatting properties depending on   * the "-style" command line option.   */  if (!strcmp(style, "compact")) {    ;  } else  if (!strcmp(style, "pretty")) {    /*     * The "pretty" style was kindly supplied by diligent user Rolf Niepraschk.     */    static const struct {      const char *key;      const char *value;    } properties[] = {      { "OL.TYPE",                  "1" },      { "OL.vspace.before",         "1" },      { "OL.vspace.after",          "1" },      { "OL.indents",               "5" },      { "UL.vspace.before",         "1" },      { "UL.vspace.after",          "1" },      { "UL.indents",               "2" },      { "DL.vspace.before",         "1" },      { "DL.vspace.after",          "1" },      { "DT.vspace.before",         "1" },      { "DIR.vspace.before",        "1" },      { "DIR.indents",              "2" },      { "MENU.vspace.before",       "1" },      { "MENU.vspace.after",        "1" },      { "DT.indent",                "2" },      { "DD.indent",                "6" },      { "HR.marker",                "-" },      { "H1.prefix",                ""  },      { "H2.prefix",                ""  },      { "H3.prefix",                ""  },      { "H4.prefix",                ""  },      { "H5.prefix",                ""  },      { "H6.prefix",                ""  },      { "H1.suffix",                ""  },      { "H2.suffix",                ""  },      { "H3.suffix",                ""  },      { "H4.suffix",                ""  },      { "H5.suffix",                ""  },      { "H6.suffix",                ""  },      { "H1.vspace.before",         "2" },      { "H2.vspace.before",         "1" },      { "H3.vspace.before",         "1" },      { "H4.vspace.before",         "1" },      { "H5.vspace.before",         "1" },      { "H6.vspace.before",         "1" },      { "H1.vspace.after",          "1" },      { "H2.vspace.after",          "1" },      { "H3.vspace.after",          "1" },      { "H4.vspace.after",          "1" },      { "H5.vspace.after",          "1" },      { "H6.vspace.after",          "1" },      { "TABLE.vspace.before",      "1" },      { "TABLE.vspace.after",       "1" },      { "CODE.vspace.before",       "0" },      { "CODE.vspace.after",        "0" },      { "BLOCKQUOTE.vspace.before", "1" },      { "BLOCKQUOTE.vspace.after",  "1" },      { "PRE.vspace.before",        "1" },      { "PRE.vspace.after",         "1" },      { "PRE.indent.left",          "2" },      { 0, 0 }    }, *p;    for (p = properties; p->key; ++p) {      Formatting::setProperty(p->key, p->value);    }  } else {    cerr      << "Unknown style \""      << style      << "\" specified -- try \"-help\"."      << endl;    ::exit(1);  }  {    ifstream ifs(rcfile.c_str());    if (!ifs.rdbuf()->is_open()) ifs.open("/etc/html2textrc");    if (ifs.rdbuf()->is_open()) {      Formatting::loadProperties(ifs);    }  }  /*   * Set up printing.   */  Area::use_backspaces = use_backspaces;  ostream  *osp;  ofstream ofs;  if (!strcmp(output_file_name, "-")) {    osp = &cout;  } else {    ofs.open(output_file_name, ios::out);    if (!ofs) {      cerr        << "Could not open output file \""        << output_file_name        << "\"."        << endl;    exit(1);    }    osp = &ofs;  }  for (i = 0; i < number_of_input_urls; ++i) {    const char *input_url = input_urls[i];    if (number_of_input_urls != 1) {      *osp << "###### " << input_url << " ######" << endl;    }    istream    *isp;    urlistream uis;    if (!strcmp(input_url, "-")) {      isp = &cin;    } else {      uis.open(input_url);      if (!uis.is_open()) {        cerr          << "Opening input URL \""          << input_url          << "\": "	  << uis.open_error()          << endl;        exit(1);      }      isp = &uis;    }    MyParser parser(      *isp,      debug_scanner,      debug_parser,      *osp,      mode,      width,      input_url    );    if (parser.yyparse() != 0) exit(1);  }  return 0;}/* ------------------------------------------------------------------------- */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -