📄 html2text.c
字号:
/* ------------------------------------------------------------------------- *//* * Copyright (c) 1999 * GMRS Software GmbH, Innsbrucker Ring 159, 81669 Munich, Germany. * http://www.gmrs.de * All rights reserved. * Author: Arno Unkrig (arno.unkrig@gmrs.de) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by GMRS Software GmbH. * 4. The name of GMRS Software GmbH may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY GMRS SOFTWARE GMBH ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GMRS SOFTWARE GMBH BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. *//* ------------------------------------------------------------------------- */#ident "$Id: html2text.C,v 1.15 1999/12/07 18:29:44 arno Exp $"#include <iostream.h>#include <string.h>#include <stdlib.h>#include "html.h"#include "HTMLControl.h"#include "urlistream.h"#include "format.h"#define stringify(x) stringify2(x)#define stringify2(x) #x/* ------------------------------------------------------------------------- */class MyParser : public HTMLControl {public: enum { PRINT_AS_ASCII, UNPARSE, SYNTAX_CHECK }; MyParser( istream &is_, bool debug_scanner_, bool debug_parser_, ostream &os_, int mode_, int width_, const char *file_name_ ) : HTMLControl(is_, debug_scanner_, debug_parser_), os(os_), mode(mode_), width(width_), file_name(file_name_) {}private: /*virtual*/ void yyerror(char *); /*virtual*/ void process(const Document &); ostream &os; int mode; int width; string file_name;};/*virtual*/ voidMyParser::yyerror(char *p){ /* * Swallow parse error messages if not in "syntax check" mode. */ if (mode != SYNTAX_CHECK && !strcmp(p, "parse error")) return; cerr << "File \"" << file_name << "\", line " << current_line << ", column " << current_column << ": " << p << endl;}/*virtual*/ voidMyParser::process(const Document &document){ switch (mode) { case PRINT_AS_ASCII: document.format(/*indent_left*/ 0, width, Area::LEFT, os); break; case UNPARSE: document.unparse(os, endl); break; case SYNTAX_CHECK: break; default: cerr << "??? Invalid mode " << mode << " ??? " << endl; exit(1); break; }}/* ------------------------------------------------------------------------- */static const char *usage = "\Usage:\n\ html2text -help\n\ html2text -version\n\ html2text [ -unparse | -check ] [ -debug-scanner ] [ -debug-parser ] \\\n\ [ -rcfile <file> ] [ -style ( compact | pretty ) ] [ -width <w> ] \\\n\ [ -o <file> ] [ -nobs ] [ <input-url> ] ...\n\Formats HTML document(s) read from <input-url> or STDIN and generates ASCII\n\text.\n\ -help Print this text\n\ -version Print the program version\n\ -unparse Generate HTML instead of ASCII output\n\ -check Do syntax checking only\n\ -debug-scanner Report parsed tokens on STDERR (debugging)\n\ -debug-parser Report parser activity on STDERR (debugging)\n\ -rcfile <file> Read \"<file>\" instead of \"$HOME/.html2textrc\"\n\ -style compact Create a very \"compact\" output format (default)\n\ -style pretty Insert some vertical space for nicer output\n\ -width <w> Optimize for screen widths other than 79\n\ -o <file> Redirect output into <file>\n\ -nobs Do not use backspaces for boldface and underlining\n\";intmain(int argc, char **argv){ if (argc == 2 && !strcmp(argv[1], "-help")) { cout << "This is HTML2TEXT, version " stringify(VERSION) << endl << "Check out http://www.gmrs.de for the latest version." << endl << endl << usage; exit(0); } if (argc == 2 && !strcmp(argv[1], "-version")) { cout << stringify(VERSION) << endl; exit(0); } bool mode = MyParser::PRINT_AS_ASCII; bool debug_scanner = false; bool debug_parser = false; const char *home = getenv("HOME"); string rcfile = string(home ? home : "") + "/.html2textrc"; const char *style = "compact"; int width = 79; const char *output_file_name = "-"; bool use_backspaces = true; int i; for (i = 1; i < argc && argv[i][0] == '-' && argv[i][1]; i++) { const char *arg = argv[i]; if (!strcmp(arg, "-unparse" )) { mode = MyParser::UNPARSE; } else if (!strcmp(arg, "-check" )) { mode = MyParser::SYNTAX_CHECK; } else if (!strcmp(arg, "-debug-scanner")) { debug_scanner = true; } else if (!strcmp(arg, "-debug-parser" )) { debug_parser = true; } else if (!strcmp(arg, "-rcfile" )) { rcfile = argv[++i]; } else if (!strcmp(arg, "-style" )) { style = argv[++i]; } else if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else { cerr << "Unrecognized command line option \"" << arg << "\", try \"-help\"." << endl; exit(1); } } if (i > argc) { cerr << "Error: Required parameter after \"" << argv[argc - 1] << "\" missing." << endl; exit(1); } const char *const *input_urls; int number_of_input_urls; if (i >= argc) { static const char *const x = "-"; input_urls = &x; number_of_input_urls = 1; } else { input_urls = argv + i; number_of_input_urls = argc - i; } /* * Set up formatting: First, set some formatting properties depending on * the "-style" command line option. */ if (!strcmp(style, "compact")) { ; } else if (!strcmp(style, "pretty")) { /* * The "pretty" style was kindly supplied by diligent user Rolf Niepraschk. */ static const struct { const char *key; const char *value; } properties[] = { { "OL.TYPE", "1" }, { "OL.vspace.before", "1" }, { "OL.vspace.after", "1" }, { "OL.indents", "5" }, { "UL.vspace.before", "1" }, { "UL.vspace.after", "1" }, { "UL.indents", "2" }, { "DL.vspace.before", "1" }, { "DL.vspace.after", "1" }, { "DT.vspace.before", "1" }, { "DIR.vspace.before", "1" }, { "DIR.indents", "2" }, { "MENU.vspace.before", "1" }, { "MENU.vspace.after", "1" }, { "DT.indent", "2" }, { "DD.indent", "6" }, { "HR.marker", "-" }, { "H1.prefix", "" }, { "H2.prefix", "" }, { "H3.prefix", "" }, { "H4.prefix", "" }, { "H5.prefix", "" }, { "H6.prefix", "" }, { "H1.suffix", "" }, { "H2.suffix", "" }, { "H3.suffix", "" }, { "H4.suffix", "" }, { "H5.suffix", "" }, { "H6.suffix", "" }, { "H1.vspace.before", "2" }, { "H2.vspace.before", "1" }, { "H3.vspace.before", "1" }, { "H4.vspace.before", "1" }, { "H5.vspace.before", "1" }, { "H6.vspace.before", "1" }, { "H1.vspace.after", "1" }, { "H2.vspace.after", "1" }, { "H3.vspace.after", "1" }, { "H4.vspace.after", "1" }, { "H5.vspace.after", "1" }, { "H6.vspace.after", "1" }, { "TABLE.vspace.before", "1" }, { "TABLE.vspace.after", "1" }, { "CODE.vspace.before", "0" }, { "CODE.vspace.after", "0" }, { "BLOCKQUOTE.vspace.before", "1" }, { "BLOCKQUOTE.vspace.after", "1" }, { "PRE.vspace.before", "1" }, { "PRE.vspace.after", "1" }, { "PRE.indent.left", "2" }, { 0, 0 } }, *p; for (p = properties; p->key; ++p) { Formatting::setProperty(p->key, p->value); } } else { cerr << "Unknown style \"" << style << "\" specified -- try \"-help\"." << endl; ::exit(1); } { ifstream ifs(rcfile.c_str()); if (!ifs.rdbuf()->is_open()) ifs.open("/etc/html2textrc"); if (ifs.rdbuf()->is_open()) { Formatting::loadProperties(ifs); } } /* * Set up printing. */ Area::use_backspaces = use_backspaces; ostream *osp; ofstream ofs; if (!strcmp(output_file_name, "-")) { osp = &cout; } else { ofs.open(output_file_name, ios::out); if (!ofs) { cerr << "Could not open output file \"" << output_file_name << "\"." << endl; exit(1); } osp = &ofs; } for (i = 0; i < number_of_input_urls; ++i) { const char *input_url = input_urls[i]; if (number_of_input_urls != 1) { *osp << "###### " << input_url << " ######" << endl; } istream *isp; urlistream uis; if (!strcmp(input_url, "-")) { isp = &cin; } else { uis.open(input_url); if (!uis.is_open()) { cerr << "Opening input URL \"" << input_url << "\": " << uis.open_error() << endl; exit(1); } isp = &uis; } MyParser parser( *isp, debug_scanner, debug_parser, *osp, mode, width, input_url ); if (parser.yyparse() != 0) exit(1); } return 0;}/* ------------------------------------------------------------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -