📄 html2texi.pl

📁 python s60 1.4.5版本的源代码
💻 PL
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
#! /usr/bin/env perl
# html2texi.pl -- Convert HTML documentation to Texinfo format
# Michael Ernst <mernst@cs.washington.edu>
# Time-stamp: <1999-01-12 21:34:27 mernst>

# This program converts HTML documentation trees into Texinfo format.
# Given the name of a main (or contents) HTML file, it processes that file,
# and other files (transitively) referenced by it, into a Texinfo file
# (whose name is chosen from the file or directory name of the argument).
# For instance:
#   html2texi.pl api/index.html
# produces file "api.texi".

# Texinfo format can be easily converted to Info format (for browsing in
# Emacs or the standalone Info browser), to a printed manual, or to HTML.
# Thus, html2texi.pl permits conversion of HTML files to Info format, and
# secondarily enables producing printed versions of Web page hierarchies.

# Unlike HTML, Info format is searchable.  Since Info is integrated into
# Emacs, one can read documentation without starting a separate Web
# browser.  Additionally, Info browsers (including Emacs) contain
# convenient features missing from Web browsers, such as easy index lookup
# and mouse-free browsing.

# Limitations:
# html2texi.pl is currently tuned to latex2html output (and it corrects
# several latex2html bugs), but should be extensible to arbitrary HTML
# documents.  It will be most useful for HTML with a hierarchical structure
# and an index, and it recognizes those features as created by latex2html
# (and possibly by some other tools).  The HTML tree to be traversed must
# be on local disk, rather than being accessed via HTTP.
# This script requires the use of "checkargs.pm".  To eliminate that
# dependence, replace calls to check_args* by @_ (which is always the last
# argument to those functions).
# Also see the "to do" section, below.
# Comments, suggestions, bug fixes, and enhancements are welcome.

# Troubleshooting:
# Malformed HTML can cause this program to abort, so
# you should check your HTML files to make sure they are legal.


###
### Typical usage for the Python documentation:
###

# (Actually, most of this is in a Makefile instead.)
# The resulting Info format Python documentation is currently available at
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz

# Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DL COMPACT><DD>.

# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html

# Edit the generated .texi files:
#   * change @setfilename to prefix "python-"
#   * fix up any sectioning, such as for Abstract
#   * make Texinfo menus
#   * perhaps remove the @detailmenu ... @end detailmenu
# In Emacs, to do all this:
#   (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))

# makeinfo api.texi
# makeinfo ext.texi
# makeinfo lib.texi
# makeinfo mac.texi
# makeinfo ref.texi
# makeinfo tut.texi


###
### Structure of the code
###

# To be written...


###
### Design decisions
###

# Source and destination languages
# --------------------------------
# 
# The goal is Info files; I create Texinfo, so I don't have to worry about
# the finer details of Info file creation.  (I'm not even sure of its exact
# format.)
# 
# Why not start from LaTeX rather than HTML?
# I could hack latex2html itself to produce Texinfo instead, or fix up
# partparse.py (which already translates LaTeX to Texinfo).
#  Pros:
#   * has high-level information such as index entries, original formatting
#  Cons:
#   * those programs are complicated to read and understand
#   * those programs try to handle arbitrary LaTeX input, track catcodes,
#     and more:  I don't want to go to that effort.  HTML isn't as powerful
#     as LaTeX, so there are fewer subtleties.
#   * the result wouldn't work for arbitrary HTML documents; it would be
#     nice to eventually extend this program to HTML produced from Docbook,
#     Frame, and more.

# Parsing
# -------
# 
# I don't want to view the text as a linear stream; I'd rather parse the
# whole thing and then do pattern matching over the parsed representation (to
# find idioms such as indices, lists of child nodes, etc.).
#  * Perl provides HTML::TreeBuilder, which does just what I want.
#     * libwww-perl: http://www.linpro.no/lwp/
#     * TreeBuilder: HTML-Tree-0.51.tar.gz
#  * Python Parsers, Formatters, and Writers don't really provide the right
#    interface (and the version in Grail doesn't correspond to another
#    distributed version, so I'm confused about which to be using).  I could
#    write something in Python that creates a parse tree, but why bother?

# Other implementation language issues:
#  * Python lacks variable declarations, reasonable scoping, and static
#    checking tools.  I've written some of the latter for myself that make
#    my Perl programming a lot safer than my Python programming will be until
#    I have a similar suite for that language.


###########################################################################
### To do
###

# Section names:
#   Fix the problem with multiple sections in a single file (eg, Abstract in
#     Front Matter section).
#   Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
# Index:
#   Perhaps double-check that every tag mentioned in the index is found
#     in the text.
# Python:  email to python-docs@python.org, to get their feedback.
#   Compare to existing lib/ Info manual
#   Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
#   Postpass to remove extra quotation marks around typography already in
#     a different font (to avoid double delimiters as in "`code'"); or
#     perhaps consider using only font-based markup so that we don't get
#     the extra *bold* and `code' markup in Info.

## Perhaps don't rely on automatic means for adding up, next, prev; I have
## all that info available to me already, so it's not so much trouble to
## add it.  (Right?)  But it is *so* easy to use Emacs instead...


###########################################################################
### Strictures
###

# man HTML::TreeBuilder
# man HTML::Parser
# man HTML::Element

# require HTML::ParserWComment;
require HTML::Parser;
require HTML::TreeBuilder;
require HTML::Element;

use File::Basename;

use strict;
# use Carp;

use checkargs;


###########################################################################
### Variables
###

# Stack of enclosing section titles.  process_child_links requires that
# scalar(@section_stack)-1 equal the depth stored in $current_ref_tdf.
my @section_stack = ();		# elements are chapter/section/subsec nodetitles (I think)
# Reference to the (title, depth, file) entry of the file being processed:
# element [1] is read as its depth and element [2] as its file name.
my $current_ref_tdf;		# for the file currently being processed;
				#  used in error messages
# NOTE(review): $html_directory and %footnotes are not referenced in this
# part of the file; their exact contents should be confirmed where they are set.
my $html_directory;
my %footnotes;

# First element should not be used.
# NOTE(review): presumably indexed by section depth to pick the Texinfo
# sectioning command -- not used in this part of the file; confirm.
my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");

# Map from HTML inline tag name to the Texinfo command (without the "@")
# that html_to_texi wraps the element's content in.
my %inline_markup = ("b" => "strong",
		     "code" => "code",
		     "i" => "emph",
		     "kbd" => "kbd",
		     "samp" => "samp",
		     "strong" => "strong",
		     "tt" => "code",
		     "var" => "var");

# NOTE(review): not referenced in this part of the file.
my @deferred_index_entries = ();

# process_child_links pushes the section title of every index it
# recognizes (a key of %index_info) onto this list.
my @index_titles = ();		# titles of the index sections found so far
# Map from index section title to [ indexing command, index suffix ]:
# the command is passed to process_index_file and the suffix is emitted
# in an "@defindex" line.
my %index_info = ("Index" => ["\@blindex", "bl"],
		  "Concept Index" => ["\@cindex", "cp"],
		  "Module Index" => ["\@mdindex", "md"]);


###########################################################################
### Main/contents page
###

# Process first-level page on its own, or just a contents page?  Well, I do
# want the title, author, etc., and the front matter...  For now, just add
# that by hand at the end.


# data structure possibilities:
#  * tree-like (need some kind of stack when processing (or parent pointers))
#  * list of name and depth; remember old and new depths.

# Each element is a reference to a list of (nodetitle, depth, filename).
# merge_contents_lists appends to and splices into this list.
my @contents_list = ();

# The problem with doing fixups on the fly is that some sections may have
# already been processed (and no longer available) by the time we notice
# others with the same name.  It's probably better to fully construct the
# contents list (reading in all files of interest) upfront; that will also
# let me do a better job with cross-references, because again, all files
# will already be read in.
#
# %contents_hash records every title that merge_contents_lists has added to
# @contents_list; %contents_fixups records the titles that were added again
# after already being present in %contents_hash (i.e. duplicated titles).
my %contents_hash = ();
my %contents_fixups = ();

# Entries (same shape as in @contents_list) collected from the page
# currently being processed; filled by increment_current_contents_list and
# emptied again by merge_contents_lists.
my @current_contents_list = ();

# Fold @current_contents_list into @contents_list, position by position,
# then empty @current_contents_list.  Takes no arguments.
#
# For each index $i of @current_contents_list:
#  * if @contents_list has no element $i, the current entry is appended;
#  * if the entry at $i in @contents_list differs in title and in file and
#    is at a shallower depth, the current entry is inserted before it;
#  * otherwise both entries must be identical (title, depth and file), or
#    the function dies reporting the mismatch.
# Every title added is recorded in %contents_hash; a title added while
# already recorded is flagged in %contents_fixups instead.
# Dies if @current_contents_list is empty.
sub merge_contents_lists ( )
{ check_args(0, @_);

  die "empty current_contents_list"
    unless scalar(@current_contents_list) != 0;

  # Bookkeeping shared by the "append" and "insert" cases below.
  my $note_title = sub
    { my ($added_title) = @_;
      if (defined $contents_hash{$added_title})
        { $contents_fixups{$added_title} = 1; }
      else
        { $contents_hash{$added_title} = 1; } };

  for my $i (0 .. $#current_contents_list)
    { my $ref_c_tdf = $current_contents_list[$i];

      # Ran off the end of the main list: simply append.
      if ($i >= scalar(@contents_list))
        { push @contents_list, $ref_c_tdf;
          $note_title->($ {$ref_c_tdf}[0]);
          next; }

      my ($title, $depth, $file) = @{$contents_list[$i]};
      my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
      my $same_title = ($title eq $c_title);
      my $same_file = ($file eq $c_file);

      # A different, deeper entry from another file: insert it here.
      if (!$same_title && ($depth < $c_depth) && !$same_file)
        { splice @contents_list, $i, 0, $ref_c_tdf;
          $note_title->($c_title);
          next; }

      # Anything else has to agree exactly with what is already there.
      unless ($same_title && ($depth == $c_depth) && $same_file)
        { die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:"
               . "\n  main:  <<<$title>>> $depth $file"
               . "\n  curr:  <<<$c_title>>> $c_depth $c_file"); }
    }

  @current_contents_list = ();
}



# Collect the child-section links below $he into @current_contents_list as
# [ title, depth, href ] entries, clean that list up, and merge it into
# @contents_list.
#
# Argument: $he, an element on which traverse() can be called (an
#   HTML::Element in practice); every <LI> below it must start with an <A>.
# Steps:
#   1. traverse $he with increment_current_contents_list;
#   2. renumber the raw tree depths into consecutive levels starting one
#      below the current section's depth;
#   3. drop a trailing "About this document ..." entry and a "Contents"
#      entry in second position;
#   4. for every entry whose title is a key of %index_info, process the
#      index file, print an "@defindex" line to TEXI, record the title in
#      @index_titles and remove the entry;
#   5. call merge_contents_lists.
# Dies if @current_contents_list is not empty on entry, or if the depth
# implied by @section_stack disagrees with the one in $current_ref_tdf.
# Maybe this function should also produce a map
#  from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{ my ($he) = check_args(1, @_);

  # $he->dump();
  if (scalar(@current_contents_list) != 0)
    { die "current_contents_list nonempty: @current_contents_list"; }
  $he->traverse(\&increment_current_contents_list, 'ignore text');

  # Normalize the depths; for instance, raw depths 1,3,5 become three
  # consecutive levels.
  my %depths = ();
  for my $ref_tdf (@current_contents_list)
    { $depths{$ {$ref_tdf}[1]} = 1; }
  # The keys are numbers, so they must be sorted numerically: the default
  # string sort would place 10 before 3 and scramble the levels of deeply
  # nested lists.
  my @sorted_depths = sort { $a <=> $b } keys %depths;
  my $current_depth = scalar(@section_stack)-1;
  my $current_depth_2 = $ {$current_ref_tdf}[1];
  if ($current_depth != $current_depth_2)
    { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
  for (my $i=0; $i<scalar(@sorted_depths); $i++)
    { $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
  for my $ref_tdf (@current_contents_list)
    { $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }

  # Eliminate uninteresting sections.  Hard-coded hack for now.
  # (Only look at the last entry when there is one; an empty list is
  # reported by merge_contents_lists below.)
  if ((scalar(@current_contents_list) > 0)
      && ($ {$current_contents_list[-1]}[0] eq "About this document ..."))
    { pop @current_contents_list; }
  # Remove a "Contents" entry in second position while keeping the first.
  if ((scalar(@current_contents_list) > 1)
      && ($ {$current_contents_list[1]}[0] eq "Contents"))
    { my $ref_first_tdf = shift @current_contents_list;
      $current_contents_list[0] = $ref_first_tdf; }

  # Index pages are handled separately and removed from the contents.
  for (my $i=0; $i<scalar(@current_contents_list); $i++)
    { my $ref_tdf = $current_contents_list[$i];
      my $title = $ {$ref_tdf}[0];
      if (exists $index_info{$title})
        { my $index_file = $ {$ref_tdf}[2];
          my ($indexing_command, $suffix) = @{$index_info{$title}};
          process_index_file($index_file, $indexing_command);
          print TEXI "\n\@defindex $suffix\n";
          push @index_titles, $title;
          splice @current_contents_list, $i, 1;
          # Re-examine this position: the next entry has moved into it.
          $i--; }
      elsif ($title =~ /\bIndex$/)
        { print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }

  merge_contents_lists();

  # print_contents_list();
  # print_index_info();
}


# Callback handed to traverse() by process_child_links.
# Arguments (as passed by traverse): the element being visited, a flag
# that is true on the start-tag visit, and the element's depth.
# On the start visit of an <LI> element, appends [ title, depth, href ] to
# @current_contents_list, where the title is the text of the leading <A>
# with punctuation removed and the href comes from anchor_info.
# Returns nothing on end-tag visits and 1 on start visits (per the
# HTML::Element traverse documentation, a true value lets the traversal
# continue into the element's children).
# Dies if the first child of an <LI> is not an <A> element.
sub increment_current_contents_list ( $$$ )
{ my ($he, $startflag, $depth) = check_args(3, @_);
  if (!$startflag)
    { return; }

  if ($he->tag eq "li")
    { # content() yields undef for an element without content, and text
      # children are plain strings rather than elements; both cases must
      # produce the error below instead of a failed dereference or
      # method call.
      my $first_child = ($he->content || [])->[0];
      if (!ref($first_child) || ($first_child->tag ne "a"))
        { die "first element of <LI> should be <A>"; }
      # anchor_info also returns the anchor's name and content, which are
      # not needed here.
      my (undef, $href) = anchor_info($first_child);
      my $title = join("", collect_texts($first_child));
      $title = texi_remove_punctuation($title);
      # The problem with these is that they are formatted differently in
      # @menu and @node!
      $title =~ s/``/\"/g;
      $title =~ s/''/\"/g;
      $title =~ s/ -- / /g;
      push @current_contents_list, [ $title, $depth, $href ]; }
  return 1;
}

# Minimal HTML-to-Texinfo conversion, intended for section titles.
# Argument: a plain text string or an element object.
# Returns: the string unchanged, or, for an element whose tag is a key of
#   %inline_markup, "@command{...}" with the converted content inside.
# Dies (after dumping the element) for any other tag.
sub html_to_texi ( $ )
{ my ($node) = check_args(1, @_);

  # Text segments are not references; pass them through untouched.
  return $node unless ref $node;

  my $tag = $node->tag;
  unless (exists $inline_markup{$tag})
    { $node->dump();
      die "html_to_texi confused by <$tag>"; }

  my $texi = "\@" . $inline_markup{$tag} . "\{";
  foreach my $child (@{$node->content})
    { $texi .= html_to_texi($child); }
  return $texi . "\}";
}



# Debugging aid: write a "Contents list:" header and then one
# "title depth file" line per entry of @contents_list to STDERR.
# Takes no arguments.
sub print_contents_list ()
{ check_args(0, @_);
  print STDERR "Contents list:\n";
  foreach my $entry (@contents_list)
    { my ($title, $depth, $file) = @{$entry};
      print STDERR join(" ", $title, $depth, $file) . "\n"; }
}



###########################################################################
### Index
###

# NOTE(review): presumably the prefix of the anchor names latex2html
# generates for index links it could not name properly -- not used in this
# part of the file; confirm against the index-processing code.
my $l2h_broken_link_name = "l2h-";


# map from file to (map from anchor name to (list of index texts))
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
# Dumped to STDERR by print_index_info.
my %file_index_entries = ();
my %this_index_entries;		# map from anchor name to (list of index texts)

# Also dumped to STDERR by print_index_info.
my %file_index_entries_broken = (); # map from file to (list of index texts)
my @this_index_entries_broken;

# NOTE(review): $index_prefix, @index_prefixes and $this_indexing_command
# are not referenced in this part of the file; see the index-processing
# code for how they are used.
my $index_prefix = "";
my @index_prefixes = ();

my $this_indexing_command;

# Debugging aid: dump the collected index data to STDERR.
# First, for every file in %file_index_entries (in sorted order), a
# "file: NAME" line followed by one "  anchor : text" item per anchor name
# (also sorted); an anchor with several index texts lists them one per
# line, aligned under the first.  Then, for every file in
# %file_index_entries_broken, a "file: NAME" line followed by its index
# texts in stored order.
# Takes no arguments.
sub print_index_info ()
{ check_args(0, @_);
  for my $file (sort keys %file_index_entries)
    { # Work through the stored references rather than copying each hash
      # and list.
      my $ref_index_entries = $file_index_entries{$file};
      my @anames = sort keys %{$ref_index_entries};
      print STDERR "file: $file\n";
      for my $aname (@anames)
        { # "  $aname : " is length($aname)+5 columns wide, so continuation
          # lines get that much indentation.  Joining a single entry yields
          # just that entry, so one statement covers both cases.
          my $separator = "\n     " . (" " x length($aname));
          print STDERR "  $aname : ", join($separator, @{$ref_index_entries->{$aname}}), "\n"; } }
  for my $file (sort keys %file_index_entries_broken)
    { my $ref_entries = $file_index_entries_broken{$file};
      print STDERR "file: $file\n";
      for my $entry (@{$ref_entries})
        { print STDERR "  $entry\n"; }
    }
}
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -