📄 html2texi.pl

📁 python s60 1.4.5版本的源代码
💻 PL
📖 第 1 页 / 共 4 页
字号:

# Parse one LaTeX2HTML index page and process every <DL COMPACT> list in it.
#
# Arguments:
#   file             - index file name; $html_directory is prepended to it
#   indexing_command - stored in $this_indexing_command while the tree is
#                      traversed, then cleared again
#
# The work itself is done by process_if_index_dl_compact, which is used as
# the traversal callback.
sub process_index_file ( $$ )
{
  my ($index_file, $command) = check_args(2, @_);
  # print "process_index_file $index_file $command\n";

  my $tree = file_to_tree($html_directory . $index_file);
  # $tree->dump();

  # The callback reads the command from this shared variable.
  $this_indexing_command = $command;
  $tree->traverse(\&process_if_index_dl_compact, 'ignore text');
  undef $this_indexing_command;
  # print "process_index_file done\n";
}


# Traversal callback: (element, start flag, depth).
#
# On the end-of-element visit (false start flag) it does nothing and returns
# an empty value.  On the start visit it hands a <DL> element that carries a
# COMPACT attribute to process_index_dl_compact and returns 0; for any other
# element it returns 1.
# NOTE(review): the 0/1 results presumably tell HTML::Element::traverse
# whether to descend into the element's children -- confirm against the
# installed HTML::Element version.
sub process_if_index_dl_compact ( $$$ )
{
  # The depth argument is counted by check_args but otherwise unused.
  my @callback_args = check_args(3, @_);
  my ($elt, $is_start) = @callback_args[0,1];

  return unless $is_start;

  my $is_compact_dl = ($elt->tag() eq "dl") && (defined $elt->attr('compact'));
  return 1 unless $is_compact_dl;

  process_index_dl_compact($elt);
  return 0;
}


# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
#  * a single space: text to be ignored
#  * <DT> elements with an optional <DD> element following each one
#    Two types of <DT> elements:
#     * Followed by a <DD> element:  the <DT> contains a single
#       string, and the <DD> contains a whitespace string to be ignored, a
#       <DL COMPACT> to be recursively processed (with the <DT> string as a
#       prefix), and a whitespace string to be ignored.
#     * Not followed by a <DD> element:  contains a list of anchors
#       and texts (ignore the texts, which are only whitespace and commas).
#       Optionally contains a <DL COMPACT> to be recursively processed (with
#       the <DT> string as a prefix)
#
# Argument: the <DL COMPACT> element.
# Each <DT> is dispatched to process_index_dt_and_dd (when the next child is
# a <DD>) or to process_index_lone_dt (otherwise).  Dies if a child is
# neither ignorable whitespace nor a <DT>.
sub process_index_dl_compact ( $ )
{
  my ($h) = check_args(1, @_);

  # content() can be undef for an element without children, and the list may
  # contain whitespace-only text; drop that text up front so that "is the
  # next child a <DD>?" looks at real elements only.
  my @content = grep { ref($_) || ($_ !~ /^\s*$/) } @{$h->content() || []};

  for (my $i = 0; $i < scalar(@content); $i++) {
    my $this_he = $content[$i];

    # Any remaining plain string is unexpected; report it explicitly rather
    # than failing inside a method call on a non-object.
    if (!ref $this_he) {
      $h->dump();
      die "Expected <DT> tag but saw text: $this_he";
    }
    if ($this_he->tag ne "dt") {
      $this_he->dump();
      die "Expected <DT> tag: " . $this_he->tag;
    }

    my $next_he = $content[$i+1];	# undef when $this_he is the last child
    if (ref($next_he) && ($next_he->tag eq "dd")) {
      process_index_dt_and_dd($this_he, $next_he);
      $i++;			# the <DD> has been consumed as well
    }
    else {
      process_index_lone_dt($this_he);
    }
  }
  return;
}



# Argument is a <DT> element.  If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]".  Example:
#       <DT>
#         <A HREF="embedding.html#l2h-201">
#           "$PATH"
#         ", "
#         <A HREF="embedding.html#l2h-205">
#           "[Link]"
# Optionally contains a <DL COMPACT> as well.  Example:
# <DT>
#   <A HREF="types.html#l2h-616">
#     "attribute"
#   <DL COMPACT>
#     <DT>
#       <A HREF="assignment.html#l2h-3074">
#         "assignment"
#       ", "
#       <A HREF="assignment.html#l2h-3099">
#         "[Link]"
#     <DT>
#       <A HREF="types.html#l2h-">
#         "assignment, class"
#
# For every anchor, the string "$this_indexing_command <prefix><text>" is
# recorded under the anchor's target:
#   * in %file_index_entries_broken{file} when the fragment equals
#     $l2h_broken_link_name,
#   * otherwise in %file_index_entries{file}{fragment}.
# A nested <DL COMPACT> is processed recursively with the first anchor's
# text appended to $index_prefix; processing of this <DT> stops after it.
# Dies on any child that is not ", ", an anchor, or a <DL>.
sub process_index_lone_dt ( $ )
{
  my ($dt) = check_args(1, @_);
  # content() can be undef for an element without children.
  my @dtcontent = @{$dt->content() || []};
  my $acontent;			# prefix + text of the first anchor
  my $acontent_suffix;		# text of the first anchor alone

  for my $item (@dtcontent) {
    if (!ref $item) {
      # The only text expected between anchors is the ", " separator.
      next if $item eq ", ";
      $dt->dump;
      die "Unexpected <DT> string element: $item";
    }

    if ($item->tag eq "dl") {
      # Check before touching the prefix stack, so a failure leaves the
      # stack balanced.
      if (!defined $acontent_suffix) {
        die "acontent_suffix not yet defined";
      }
      push @index_prefixes, $index_prefix;
      $index_prefix .= $acontent_suffix . ", ";
      process_index_dl_compact($item);
      $index_prefix = pop(@index_prefixes);
      return;
    }

    if ($item->tag ne "a") {
      $dt->dump;
      $item->dump;
      die "Expected anchor in lone <DT>";
    }

    # The first value returned by anchor_info (the anchor name) is not needed.
    my (undef, $ahref, @acontent) = anchor_info($item);
    if (scalar(@acontent) != 1) {
      die "Expected just one content of <A> in <DT>: @acontent";
    }
    if (ref $acontent[0]) {
      $acontent[0]->dump;
      die "Expected string content of <A> in <DT>: $acontent[0]";
    }

    if (!defined($acontent)) {
      $acontent = $index_prefix . $acontent[0];
      $acontent_suffix = $acontent[0];
    }
    elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0]))) {
      # Later anchors must either be "[Link]" or repeat the first text.
      die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>";
    }

    if (!defined $ahref) {
      $dt->dump;
      die "no HREF in anchor in <DT>";
    }
    my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
    if (!defined $ahref_name) {
      # Reference to entire file
      $ahref_name = "";
    }

    my $entry = "$this_indexing_command $acontent";

    if ($ahref_name eq $l2h_broken_link_name) {
      push @{$file_index_entries_broken{$ahref_file}}, $entry;
      next;
    }

    # Push through the nested reference directly (autovivifying both
    # levels); copying the inner hash into a local %hash first would update
    # only the copy.
    push @{$file_index_entries{$ahref_file}{$ahref_name}}, $entry;
    # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
  }
  return;
}

# Process a <DT> that is followed by a <DD> in a LaTeX2HTML index list.
#
# Arguments: the <DT> element and the <DD> element that follows it.
# The <DT> must contain exactly one string; with trailing spaces removed it
# is appended (plus ", ") to $index_prefix while the <DL> inside the <DD> is
# processed recursively.  $index_prefix is restored afterwards.
# Dies if the <DT> is not a single string or if the <DD> does not contain
# exactly one <DL> (whitespace-only text around it is ignored).
sub process_index_dt_and_dd ( $$ )
{
  my ($dt, $dd) = check_args(2, @_);

  # content() can be undef for an element without children; treating that as
  # an empty list lets the size checks below report the problem.
  my @dt_items = @{$dt->content() || []};
  if ((scalar(@dt_items) != 1) || (ref $dt_items[0])) {
    $dd->dump;
    $dt->dump;
    die "Expected single string (actual size = " . scalar(@dt_items) . ") in content of <DT>: @dt_items";
  }
  my $dtcontent = $dt_items[0];
  $dtcontent =~ s/ +$//;

  # Whitespace-only strings around the nested list are ignorable.
  my @dd_items = grep { ref($_) || ($_ !~ /^\s*$/) } @{$dd->content() || []};
  if (scalar(@dd_items) != 1) {
    die "Expected single <DD> content, got ", scalar(@dd_items), " elements:\n", join("\n", @dd_items), "\n ";
  }
  my $ddcontent = $dd_items[0];
  # Check that it is an element before asking for its tag, so that stray
  # text yields this diagnostic instead of a method-call failure.
  if (!ref($ddcontent) || ($ddcontent->tag ne "dl")) {
    die "Expected <DL> as content of <DD>, but saw: $ddcontent";
  }

  push @index_prefixes, $index_prefix;
  $index_prefix .= $dtcontent . ", ";
  process_index_dl_compact($ddcontent);
  $index_prefix = pop(@index_prefixes);
}


###########################################################################
### Ordinary sections
###

# Convert one section's HTML file.
#
# Arguments:
#   file      - HTML file name; used as-is when it starts with "/", otherwise
#               $html_directory is prepended
#   depth     - nesting depth of the section; @section_stack is cut back to
#               this many entries before the section's title is pushed
#   nodetitle - title of the section
#
# Steps, in order: adjust @section_stack, run the child-link and footnote
# passes over the tree, load this file's index entries into
# %this_index_entries / @this_index_entries_broken, check that the tree is
# <HTML> with <HEAD> then <BODY>, and finally traverse the body with
# output_body.
sub process_section_file ( $$$ )
{
  my ($file, $depth, $nodetitle) = check_args(3, @_);
  my $path = ($file =~ /^\//) ? $file : $html_directory . $file;
  my $tree = file_to_tree($path);

  # print STDERR "process_section_file: $file $depth $nodetitle\n";

  # Keep the first $depth titles.  When the stack already holds at least
  # $depth entries this is the same as popping until
  # scalar(@section_stack) == $depth; when it is shorter, the slice pads it
  # with undef up to $depth entries.
  @section_stack = @section_stack[0..$depth-1];

  # Not a great nodename fixup scheme; need a more global view
  if ((defined $contents_fixups{$nodetitle})
      && (scalar(@section_stack) > 0)) {
    my $parent_title = $section_stack[-1];
    # hack for Python Standard Library
    $parent_title =~ s/^(Built-in|Standard) Module //g;
    my ($parent_first_word) = split(/ /, $parent_title);
    $nodetitle = "$parent_first_word $nodetitle";
  }

  push @section_stack, $nodetitle;
  # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

  $tree->traverse(\&process_if_child_links, 'ignore text');
  %footnotes = ();
  # $tree->dump;
  $tree->traverse(\&process_if_footnotes, 'ignore text');

  # $tree->dump;

  # Index entries recorded for this file (none recorded => empty).
  %this_index_entries = (exists $file_index_entries{$file})
    ? %{$file_index_entries{$file}}
    : ();
  # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";

  @this_index_entries_broken = (exists $file_index_entries_broken{$file})
    ? @{$file_index_entries_broken{$file}}
    : ();

  unless ($tree->tag() eq "html") {
    die "Expected <HTML> at top level";
  }
  my @top_level = @{$tree->content()};
  unless ((ref $top_level[0]) and ($top_level[0]->tag eq "head")) {
    $tree->dump;
    die "<HEAD> not first element of <HTML>";
  }
  unless ((ref $top_level[1]) and ($top_level[1]->tag eq "body")) {
    $tree->dump;
    die "<BODY> not second element of <HTML>";
  }

  $top_level[1]->traverse(\&output_body);
}

# Stack of the tags we are currently inside that prevent index entries from
# being emitted right now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers = ();

# Maintain @index_deferrers for one element visit.
#
# Arguments: the tag name and the traversal start flag.
# On a start visit the tag is pushed.  On an end visit the top of the stack
# is popped and must equal the tag (otherwise the sub dies); then
# do_deferred_index_entries is called.
sub push_or_pop_index_deferrers ( $$ )
{
  my ($tag, $startflag) = check_args(2, @_);

  if (!$startflag) {
    my $popped = pop @index_deferrers;
    if ($popped ne $tag) {
      die "Expected $tag at top of index_deferrers but saw $popped; remainder = ", join(" ", @index_deferrers);
    }
    do_deferred_index_entries();
  }
  else {
    push @index_deferrers, $tag;
  }
}


# Queue the index entries that belong at the anchor named LABEL.
#
# Arguments:
#   label - anchor name
#   he    - the anchor element (optional in the prototype, but it is used
#           whenever LABEL equals $l2h_broken_link_name)
#
# If %this_index_entries has entries for LABEL they are appended to
# @deferred_index_entries and the sub returns.  Otherwise, when LABEL is the
# broken-link name, texts near the anchor are collected and every entry of
# @this_index_entries_broken that contains one of them as a substring is
# appended instead.
sub label_add_index_entries ( $;$ )
{
  my ($label, $he) = check_args_range(1, 2, @_);
  # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";

  if (exists $this_index_entries{$label}) {
    push @deferred_index_entries, @{$this_index_entries{$label}};
    return;
  }

  if ($label eq $l2h_broken_link_name) {
    # Try to find some text to use in guessing which links should point here.
    # Looking at only the previous element (or the one before it, if that
    # is all punctuation) would probably do; collecting all the previous
    # texts is a bit of overkill.
    my @anchor_texts = collect_texts($he);
    my @previous_texts = collect_texts($he->parent, $he);

    # Use the anchor's own texts plus up to 4 of the preceding texts, nearest
    # first.  4 is arbitrary; punctuation and small words ought to be
    # filtered out first, and perhaps the texts should be split into words
    # in case the index term is not a chunk of its own (eg, it was inside
    # <tt>...</tt>).
    my @nearest_first = reverse(@previous_texts);
    my $last_wanted = min(3, $#previous_texts);
    my @candidate_texts = (@anchor_texts, @nearest_first[0..$last_wanted]);

    my $guessed = 0;
    for my $candidate (@candidate_texts) {
      # my $orig_text = $candidate;
      # Skip texts that are only punctuation/spaces, or very short.
      next if $candidate =~ /^[\"\`\'().?! ]*$/;
      next if length($candidate) <= 2;
      # hack for Python manual; maybe defer until failure first time around?
      $candidate =~ s/^sys\.//g;
      for my $broken_entry (@this_index_entries_broken) {
        # Could test for position zero instead:  LaTeX2HTML's failures in
        # the Python documentation are only for items of the form
        # "... (built-in...)"
        next if index($broken_entry, $candidate) == -1;
        push @deferred_index_entries, $broken_entry;
        # print STDERR "Guessing index term `$broken_entry' for text `$orig_text'\n";
        $guessed = 1;
      }
    }
    if (!$guessed) {
      # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
    }
  }
}


# Write out the queued index entries, unless something is deferring them.
#
# When @deferred_index_entries is non-empty and @index_deferrers is empty,
# the entries are printed to TEXI one per line (with a newline before the
# first and after the last) and the queue is emptied.  Otherwise nothing
# happens.
#
# TODO: calls to this still need to be added at various places.  Perhaps
# add an HTML::Element argument and do the check for appropriateness here
# (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{
  check_args(0, @_);

  my $have_pending = scalar(@deferred_index_entries) > 0;
  if ($have_pending && (scalar(@index_deferrers) == 0)) {
    print TEXI "\n", join("\n", @deferred_index_entries), "\n";
    @deferred_index_entries = ();
  }
}

# File-scoped table state.
# NOTE(review): nothing in the visible part of the file reads or writes
# these; presumably they are maintained by the table handling in
# output_body -- confirm.
my $table_columns;		# undefined if not in a table
my $table_first_column;		# boolean

sub output_body ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; #  ignore depth argument

  if (!ref $he)
    { my $space_index = index($he, " ");
      if ($space_index != -1)
	{ # Why does
	  #   print TEXI texi_quote(substr($he, 0, $space_index+1));
	  # give:  Can't locate object method "TEXI" via package "texi_quote"
	  # (Because the definition texi_quote hasn't been seen yet.)
	  print TEXI &texi_quote(substr($he, 0, $space_index+1));
	  do_deferred_index_entries();
	  print TEXI &texi_quote(substr($he, $space_index+1)); }
      else
	{ print TEXI &texi_quote($he); }
      return; }

  my $tag = $he->tag();

  # Ordinary text markup first
  if (exists $inline_markup{$tag})
    { if ($startflag)
	{ print TEXI "\@$inline_markup{$tag}\{"; }
      else
	{ print TEXI "\}"; } }
  elsif ($tag eq "a")
    { my ($name, $href, @content) = anchor_info($he);
      if (!$href)
	{ # This anchor is only here for indexing/cross referencing purposes.
	  if ($startflag)
	    { label_add_index_entries($name, $he); }
	}
      elsif ($href =~ "^(ftp|http|news):")
	{ if ($startflag)
	    { # Should avoid second argument if it's identical to the URL.
	      print TEXI "\@uref\{$href, "; }
	  else
	    { print TEXI "\}"; }
	}
      elsif ($href =~ /^\#(foot[0-9]+)$/)
	{ # Footnote
	  if ($startflag)
	    { # Could double-check name and content, but I'm not
	      # currently storing that information.
	      print TEXI "\@footnote\{";
	      $footnotes{$1}->traverse(\&output_body);
	      print TEXI "\}";
	      return 0; } }
      else
	{ if ($startflag)
	    { # cross-references are not active Info links, but no text is lost
	      print STDERR "Can't deal with internal HREF anchors yet:\n";
	      $he->dump; }
	}
    }
  elsif ($tag eq "br")
    { print TEXI "\@\n"; }
  elsif ($tag eq "body")
    { }
  elsif ($tag eq "center")
    { if (has_single_content_string($he)
	  && ($ {$he->content}[0] =~ /^ *$/))
	{ return 0; }
      if ($startflag)
	{ print TEXI "\n\@center\n"; }
      else
	{ print TEXI "\n\@end center\n"; }
    }
  elsif ($tag eq "div")
    { my $align = $he->attr('align');
      if (defined($align) && ($align eq "center"))
	{ if (has_single_content_string($he)
	      && ($ {$he->content}[0] =~ /^ *$/))
	    { return 0; }
	  if ($startflag)
	    { print TEXI "\n\@center\n"; }
	  else
	    { print TEXI "\n\@end center\n"; } }
    }
  elsif ($tag eq "dl")
    { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
      if (has_single_content_with_tag($he, "dd"))
	{ my $he_dd = $ {$he->content}[0];
	  if (has_single_content_with_tag($he_dd, "pre"))
	    { my $he_pre = $ {$he_dd->content}[0];
	      print_pre($he_pre);
	      return 0; } }
      if ($startflag)
	{ # Could examine the elements, to be cleverer about formatting.
	  # (Also to use ftable, vtable...)
	  print TEXI "\n\@table \@asis\n"; }
      else
	{ print TEXI "\n\@end table\n"; }
    }
  elsif ($tag eq "dt")
    { push_or_pop_index_deferrers($tag, $startflag);
      if ($startflag)
	{ print TEXI "\n\@item "; }
      else
	{ } }
  elsif ($tag eq "dd")
    { if ($startflag)
	{ print TEXI "\n"; }
      else
	{ }
      if (scalar(@index_deferrers) != 0)
	{ $he->dump;
	  die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
      do_deferred_index_entries();
    }
  elsif ($tag =~ /^(font|big|small)$/)
    { # Do nothing for now.
    }
  elsif ($tag =~ /^h[1-6]$/)
    { # We don't need this because we never recursively enter the heading content.
      # push_or_pop_index_deferrers($tag, $startflag);
      my $secname = "";
      my @seclabels = ();
      for my $elt (@{$he->content})
	{ if (!ref $elt)
	    { $secname .= $elt; }
	  elsif ($elt->tag eq "br")
	    { }
	  elsif ($elt->tag eq "a")
	    { my ($name, $href, @acontent) = anchor_info($elt);
              if ($href)
                { $he->dump;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -