📄 html2texi.pl
字号:
$elt->dump;
die "Nonsimple anchor in <$tag>"; }
if (!defined $name)
{ die "No NAME for anchor in $tag"; }
push @seclabels, $name;
for my $subelt (@acontent)
{ $secname .= html_to_texi($subelt); } }
else
{ $secname .= html_to_texi($elt); } }
if ($secname eq "")
{ die "No section name in <$tag>"; }
if (scalar(@section_stack) == 1)
{ if ($section_stack[-1] ne "Top")
{ die "Not top? $section_stack[-1]"; }
print TEXI "\@settitle $secname\n";
print TEXI "\@c %**end of header\n";
print TEXI "\n";
print TEXI "\@node Top\n";
print TEXI "\n"; }
else
{ print TEXI "\n\@node $section_stack[-1]\n";
print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
for my $seclabel (@seclabels)
{ label_add_index_entries($seclabel); }
# This should only happen once per file.
label_add_index_entries("");
if (scalar(@index_deferrers) != 0)
{ $he->dump;
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
do_deferred_index_entries();
return 0;
}
elsif ($tag eq "hr")
{ }
elsif ($tag eq "ignore")
{ # Hack for ignored elements
return 0;
}
elsif ($tag eq "li")
{ if ($startflag)
{ print TEXI "\n\n\@item\n";
do_deferred_index_entries(); } }
elsif ($tag eq "ol")
{ if ($startflag)
{ print TEXI "\n\@enumerate \@bullet\n"; }
else
{ print TEXI "\n\@end enumerate\n"; } }
elsif ($tag eq "p")
{ if ($startflag)
{ print TEXI "\n\n"; }
if (scalar(@index_deferrers) != 0)
{ $he->dump;
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
do_deferred_index_entries(); }
elsif ($tag eq "pre")
{ print_pre($he);
return 0; }
elsif ($tag eq "table")
{ # Could also indicate common formatting for first column, or
# determine relative widths for columns (or determine a prototype row)
if ($startflag)
{ if (defined $table_columns)
{ $he->dump;
die "Can't deal with table nested inside $table_columns-column table"; }
$table_columns = table_columns($he);
if ($table_columns < 2)
{ $he->dump;
die "Column with $table_columns columns?"; }
elsif ($table_columns == 2)
{ print TEXI "\n\@table \@asis\n"; }
else
{ print TEXI "\n\@multitable \@columnfractions";
for (my $i=0; $i<$table_columns; $i++)
{ print TEXI " ", 1.0/$table_columns; }
print TEXI "\n"; } }
else
{ if ($table_columns == 2)
{ print TEXI "\n\@end table\n"; }
else
{ print TEXI "\n\@end multitable\n"; }
undef $table_columns; } }
elsif (($tag eq "td") || ($tag eq "th"))
{ if ($startflag)
{ if ($table_first_column)
{ print TEXI "\n\@item ";
$table_first_column = 0; }
elsif ($table_columns > 2)
{ print TEXI "\n\@tab "; } }
else
{ print TEXI "\n"; } }
elsif ($tag eq "tr")
{ if ($startflag)
{ $table_first_column = 1; } }
elsif ($tag eq "ul")
{ if ($startflag)
{ print TEXI "\n\@itemize \@bullet\n"; }
else
{ print TEXI "\n\@end itemize\n"; } }
else
{ # I used to have a newline before "output_body" here.
print STDERR "output_body: ignoring <$tag> tag\n";
$he->dump;
return 0; }
return 1;
}
# Write the text of a <PRE> element to TEXI wrapped in @example ... @end example.
# The element must hold exactly one plain-text child; anything else is fatal.
sub print_pre ( $ )
{
    my ($pre_elt) = check_args(1, @_);
    unless (has_single_content_string($pre_elt)) {
        die "Multiple or non-string content for <PRE>: ", @{$pre_elt->content};
    }
    # The single child is a plain string; quote the Texinfo special characters.
    my $raw_text = $pre_elt->content->[0];
    print TEXI "\n\@example";
    print TEXI texi_quote($raw_text);
    print TEXI "\@end example\n";
}
# Return the number of columns in a table element, i.e. the largest number of
# children found in any of its rows.  A table with no content, or whose rows
# are all empty, yields 0.
# Dies (after dumping the offending nodes) if any direct child of the table is
# not a <TR> element.
sub table_columns ( $ )
{
    my ($table) = check_args(1, @_);
    my $result = 0;
    my $ref_rows = $table->content;
    # An element without children can report undefined content.
    if (!defined $ref_rows) {
        return $result;
    }
    for my $row (@{$ref_rows}) {
        # A plain text child is not an object, so ->tag cannot be called on it.
        if (!ref $row) {
            $table->dump;
            die "Expected <TR> as table row.";
        }
        if ($row->tag ne "tr") {
            $table->dump;
            $row->dump;
            die "Expected <TR> as table row.";
        }
        # An empty <TR> has undefined content; count it as zero cells instead
        # of dereferencing undef.
        my $ref_cells = $row->content;
        my $cell_count = (defined $ref_cells) ? scalar(@{$ref_cells}) : 0;
        $result = max($result, $cell_count);
    }
    return $result;
}
###########################################################################
### Utilities
###
# Return the numerically smaller of the two arguments (the second one when
# they compare equal).
sub min ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first < $second) {
        return $first;
    }
    return $second;
}
# Return the numerically larger of the two arguments (the second one when
# they compare equal).
sub max ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first > $second) {
        return $first;
    }
    return $second;
}
# Parse the HTML file named $file into an HTML::TreeBuilder tree, run
# cleanup_parse_tree over it, and return the tree.
# Dies if the file cannot be opened for parsing.
sub file_to_tree ( $ )
{
    my ($file) = check_args(1, @_);
    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(1);
    # $tree->warn(1);
    # parse_file returns an undefined value (with the reason in $!) when the
    # file cannot be opened; without this check an unreadable input would
    # silently yield an empty tree.
    if (!defined $tree->parse_file($file)) {
        die "Cannot parse HTML file $file: $!";
    }
    cleanup_parse_tree($tree);
    return $tree;
}
# Return 1 if the element has exactly one child (of any kind), 0 otherwise.
# An element whose content is undefined counts as having no children.
# Dies if the argument is not a reference.
sub has_single_content ( $ )
{
    my ($elt) = check_args(1, @_);
    # A non-reference argument is treated as a caller error rather than
    # quietly answered with 0.
    ref $elt
        or die "Non-reference argument: $elt";
    my $children = $elt->content;
    return 0 unless defined $children;
    return (scalar(@{$children}) == 1) ? 1 : 0;
}
# Return true if the element has exactly one child, that child is itself an
# element (not a text string), and the child's tag equals $tag.
sub has_single_content_with_tag ( $$ )
{
    my ($elt, $wanted_tag) = check_args(2, @_);
    return 0 unless has_single_content($elt);
    my $only_child = $elt->content->[0];
    # Text children are plain strings and have no tag.
    return 0 unless ref $only_child;
    my $child_tag = $only_child->tag;
    return 0 unless defined $child_tag;
    return $child_tag eq $wanted_tag;
}
# Return 1 if the element has exactly one child and that child is a plain
# text string (not a reference); return 0 otherwise.
sub has_single_content_string ( $ )
{
    my ($elt) = check_args(1, @_);
    return 0 unless has_single_content($elt);
    my $only_child = $elt->content->[0];
    return (ref $only_child) ? 0 : 1;
}
# Return the list (name, href, content...) for an <A> element.  The name and
# href attributes may each be undefined; the content follows as zero or more
# list items.  Dies (after dumping the element) if it is not an anchor.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{
    my ($anchor) = check_args(1, @_);
    unless ($anchor->tag eq "a") {
        $anchor->dump;
        die "passed non-anchor to anchor_info";
    }
    my $anchor_name = $anchor->attr('name');
    my $anchor_href = $anchor->attr('href');
    # Undefined content is reported as an empty content list.
    my $children = $anchor->content;
    my @kids = (defined $children) ? @{$children} : ();
    return ($anchor_name, $anchor_href, @kids);
}
# Return a copy of the text made safe for Texinfo: each "@", "{" and "}" is
# prefixed with "@", and " -- " is widened to " --- ".
sub texi_quote ( $ )
{
    my ($quoted) = check_args(1, @_);
    # Alias $_ to the local copy so the substitutions apply to it directly.
    for ($quoted) {
        s/([\@\{\}])/\@$1/g;
        s/ -- / --- /g;
    }
    return $quoted;
}
# Strip from a section title the punctuation that confuses Makeinfo or Info.
# Applied in order: leading spaces, trailing spaces and colons, a leading
# section number (digits and dots followed by spaces), then every comma and
# every remaining colon.  Returns the cleaned copy.
sub texi_remove_punctuation ( $ )
{
    my ($title) = check_args(1, @_);
    # Alias $_ to the local copy so the substitutions apply to it directly.
    for ($title) {
        s/^ +//g;
        s/[ :]+$//g;
        s/^[1-9][0-9.]* +//g;
        s/,//g;
        # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
        # gets converted into " - ", just as "---" would be converted into
        # " -- ", so the names end up differing.)  Hence colons are dropped
        # outright instead of being rewritten:
        # s/:/ -- /g;
        s/://g;
    }
    return $title;
}
## Do not use this inside `traverse': it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Detach $he from its parent's content list and clear its parent link.
# The parent defaults to $he->parent when not supplied.
# Returns 1 on success; dies if $he is not among the parent's content.
sub html_remove ( $;$ )
{
    my ($victim, $parent) = check_args_range(1, 2, @_);
    unless (defined $parent) {
        $parent = $victim->parent;
    }
    my $siblings = $parent->content;
    # Search a snapshot, but splice the parent's own list.
    my @snapshot = @{$siblings};
    for my $idx (0 .. $#snapshot) {
        next unless $snapshot[$idx] eq $victim;
        splice @{$siblings}, $idx, 1;
        $victim->parent(undef);
        return 1;
    }
    die "Didn't find $victim in $parent";
}
# Put $new in the slot that $orig occupies in its parent's content list,
# point $new at that parent, and clear $orig's parent link.
# The parent defaults to $orig->parent when not supplied.
# Returns 1 on success; dies if $orig is not among the parent's content.
sub html_replace ( $$;$ )
{
    my ($old_elt, $new_elt, $parent) = check_args_range(2, 3, @_);
    unless (defined $parent) {
        $parent = $old_elt->parent;
    }
    my $siblings = $parent->content;
    # Search a snapshot, but write into the parent's own list.
    my @snapshot = @{$siblings};
    for my $idx (0 .. $#snapshot) {
        next unless $snapshot[$idx] eq $old_elt;
        $siblings->[$idx] = $new_elt;
        $new_elt->parent($parent);
        $old_elt->parent(undef);
        return 1;
    }
    die "Didn't find $old_elt in $parent";
}
# Replace $orig, in its parent's content, with a freshly created <meta>
# element.  The parent defaults to $orig->parent when not supplied.
# Returns whatever html_replace returns.
sub html_replace_by_meta ( $;$ )
{
    my ($old_elt, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    my $target_parent = (defined $parent) ? $parent : $old_elt->parent;
    return html_replace($old_elt, $placeholder, $target_parent);
}
# Replace $orig, in its parent's content, with a freshly created <ignore>
# element.  The parent defaults to $orig->parent when not supplied.
# Returns whatever html_replace returns.
sub html_replace_by_ignore ( $;$ )
{
    my ($old_elt, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    my $target_parent = (defined $parent) ? $parent : $old_elt->parent;
    return html_replace($old_elt, $placeholder, $target_parent);
}
###
### Collect text elements
###
# State shared between collect_texts and its traversal callback collect_if_text.
# Text strings gathered so far; reset at the start of every collect_texts call.
my @collected_texts;
# Element at which collection stops; undefined means there is no stop point.
my $collect_texts_stoppoint;
# Set to 1 by collect_if_text once the stop point has been reached.
my $done_collecting;
# Traverse the tree under $root and return the text strings encountered, in
# traversal order.  If $stop is given, collection ends when that element is
# reached.  Results are accumulated in the file-level collector variables.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);
    # print STDERR "collect_texts: $root $stop\n";
    # Reset the shared collector state before walking the tree.
    @collected_texts = ();
    $done_collecting = 0;
    $collect_texts_stoppoint = $stop;
    $root->traverse(\&collect_if_text);   # process texts
    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
    return @collected_texts;
}
# Traversal callback for collect_texts.  Appends plain-text nodes to
# @collected_texts and marks collection as finished on reaching the stop
# point.  Returns 1 for an element that is not the stop point, 0 otherwise.
sub collect_if_text ( $$$ )
{
    # Only the node is used; the other two traversal arguments are ignored.
    my ($node) = check_args(3, @_);
    return 0 if $done_collecting || !defined $node;
    unless (ref $node) {
        # Non-reference nodes are text strings.
        push @collected_texts, $node;
        return 0;
    }
    if (defined $collect_texts_stoppoint && $node eq $collect_texts_stoppoint) {
        $done_collecting = 1;
        return 0;
    }
    return 1;
}
###########################################################################
### Clean up parse tree
###
# Run the cleanup passes over the tree, one full traversal per pass and in
# this fixed order: delete_if_navigation, delete_extra_spaces, merge_dl,
# reorder_dt_and_dl.  Text nodes are not passed to the callbacks.
# Returns the same tree.
sub cleanup_parse_tree ( $ )
{
    my ($tree) = check_args(1, @_);
    my @passes = (
        \&delete_if_navigation,
        \&delete_extra_spaces,
        \&merge_dl,
        \&reorder_dt_and_dl,
    );
    for my $pass (@passes) {
        $tree->traverse($pass, 'ignore text');
    }
    return $tree;
}
## Simpler version that deletes contents but not the element itself.
# sub delete_if_navigation ( $$$ )
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
# { $he->delete();
# return 0; }
# else
# { return 1; }
# }
# Traversal callback: on entering a <div class="navigation"> element, remove
# it from its parent's content list and delete it.
# Returns 0 after deleting, 1 for any other element on entry, and nothing
# when called with a false start flag.
sub delete_if_navigation ( $$$ )
{
    # The depth argument is ignored.
    my ($elt, $entering) = (check_args(3, @_))[0,1];
    return unless $entering;
    return 1 if $elt->tag() ne "div";
    my $class = $elt->attr('class');
    return 1 unless defined $class && $class eq 'navigation';

    # Work on the parent's own content list (through the reference), never on
    # a copy of it, so that the removal actually takes effect.
    my $siblings = $elt->parent()->content();
    my $sibling_count = scalar(@{$siblings});
    for my $idx (0 .. $sibling_count - 1) {
        if ($siblings->[$idx] eq $elt) {
            splice(@{$siblings}, $idx, 1);
            last;
        }
    }
    $elt->delete();
    return 0;
}
# Traversal callback: on entering an element, call delete_child_spaces on it
# if it is a head, html, table, tr or ul element, then call
# delete_trailing_spaces on it in every case.
# Returns 1 on entry and nothing when called with a false start flag.
sub delete_extra_spaces ( $$$ )
{
    # The depth argument is ignored.
    my ($elt, $entering) = (check_args(3, @_))[0,1];
    return unless $entering;
    my $elt_tag = $elt->tag;
    delete_child_spaces($elt)
        if $elt_tag =~ /^(head|html|table|tr|ul)$/;
    delete_trailing_spaces($elt);
    return 1;
}
# Remove, in place, every direct child of the element that is empty or
# consists only of spaces.  Does nothing when the element's content is
# undefined.  Returns nothing.
sub delete_child_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    # An element without children can report undefined content; guard as
    # delete_trailing_spaces does instead of dereferencing undef.
    if (! defined $ref_content) {
        return;
    }
    # Filter the element's own content list through the reference so the
    # change is visible to the tree.
    @{$ref_content} = grep { $_ !~ /^ *$/ } @{$ref_content};
    return;
}
# Remove blank (empty or spaces-only) children of the element in two places:
# immediately before a br, dd, dl, dt, hr, p or ul child, and, when the
# element itself is a dd, dt, h1-h6, li or p, at the very end of its content.
# Does nothing when the element's content is undefined.
sub delete_trailing_spaces ( $ )
{
    my ($elt) = check_args(1, @_);
    my $kids = $elt->content();
    return unless defined $kids;
    # Could also check for previous element = /^h[1-6]$/.
    # Scan forward.  After a removal the index is not advanced, so the slot is
    # examined again with its new occupant.
    my $idx = 0;
    while ($idx < scalar(@{$kids}) - 1) {
        if ($kids->[$idx] =~ /^ *$/) {
            my $follower = $kids->[$idx + 1];
            if ((ref $follower) && ($follower->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
                splice(@{$kids}, $idx, 1);
                next;
            }
        }
        $idx++;
    }
    if ($elt->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) {
        # Index -1 is undefined for an empty list, hence the defined test.
        my $final_child = $kids->[-1];
        if ((defined $final_child) && ($final_child =~ /^ *$/)) {
            pop @{$kids};
        }
    }
}
# LaTeX2HTML sometimes creates
# <DT>text
# <DL COMPACT><DD>text
# which should actually be:
# <DL COMPACT>
# <DT>text
# <DD>text
# Since a <DL> gets added, this ends up looking like
# <P>
# <DL>
# <DT>
# text1...
# <DL COMPACT>
# <DD>
# text2...
# dt_or_dd1...
# dt_or_dd2...
# which should become
# <P>
# <DL COMPACT>
# <DT>
# text1...
# <DD>
# text2...
# dt_or_dd1...
# dt_or_dd2...
sub reorder_dt_and_dl ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
if (!$startflag)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -