📄 html-sum.pl
字号:
: # *-*-perl-*-*
    eval 'exec perl -S $0 "$@"'
    if $running_under_some_shell;
#
#  HTML-sum.pl
#
#  $Id: HTML-sum.pl,v 2.2 2000/02/03 12:45:55 sxw Exp $
#
#  andy powell <a.powell@ukoln.ac.uk>
#
############################################################################
#
#  Harvest Indexer http://harvest.sourceforge.net/
#  -----------------------------------------------
#
#  The Harvest Indexer is a continued development of code developed by
#  the Harvest Project. Development is carried out by numerous individuals
#  in the Internet community, and is not officially connected with the
#  original Harvest Project or its funding sources.
#
#  Please mail lee@arco.de if you are interested in participating
#  in the development effort.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
############################################################################
#
#  Parse HTML file to generate SOIF record on STDOUT
#
#  Usage: HTML-sum.pl [-u URL] filename
#
#  Giving the -u URL option causes a complete SOIF record to be generated
#  including an opening
#
#    @FILE { URL
#
#  and a closing
#
#    }
#
#  Otherwise a simple list of SOIF attributes is generated which is what
#  Harvest expects from HTML-sum.pl.
#
#  How it works: package P subclasses HTML::Parser and overrides the
#  start/end/text/comment callbacks.  The callbacks accumulate values in
#  package globals (mostly in the soif:: namespace); once the whole file
#  has been parsed the accumulated values are written out with
#  soif::print_av() (provided by soif.pl).
#

$ENV{'HARVEST_HOME'} = "/usr/local/harvest" if (!defined($ENV{'HARVEST_HOME'}));
unshift(@INC, "$ENV{'HARVEST_HOME'}/lib");    # use local files

require "soif.pl";
require HTML::Parser;
use Getopt::Std;
use HTML::Entities ();

package P;

# strict is enabled only from here on: the shell-bootstrap line at the top
# of the file deliberately uses an undeclared global.
use strict;

our @ISA = qw(HTML::Parser);

# Change these as desired...
#
our $createURLTexts = 1;        # Create SOIF 'url-texts' attribute to match
                                # 'url-references'
our $createComments = 1;        # Create SOIF 'comments' attribute from HTML
                                # comments
our $createScriptLanguages = 1; # Create SOIF 'script-languages' attribute to
                                # indicate scripting languages used on page
our $createApplets = 1;         # Create SOIF 'applets' attribute from HTML
                                # applet tags
our $splitFullText = 1;         # Add blank lines to SOIF 'full-text' attribute
                                # on some tags - may help with later text
                                # processing of SOIF record?

our @paraTags = (       # List of tags that cause a paragraph break...
    'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', 'dl',
    'pre', 'div', 'center', 'blockquote', 'form', 'hr', 'table',
);

our @keytags = (        # List of tags that go into SOIF keywords...
    'b', 'i', 'a', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
);

# Exact-match lookup tables built from the lists above.  A hash lookup
# avoids treating the tag name as a regular expression and avoids
# rescanning the lists for every tag in the document.
my %isParaTag = map { ($_ => 1) } @paraTags;
my %isKeyTag  = map { ($_ => 1) } @keytags;

# Parser state shared between the callbacks.  The first five are nesting
# counters: non-zero while the parser is inside the corresponding element.
our ($titletext, $headingstext, $keywordstext, $urltext, $inscript)
    = (0, 0, 0, 0, 0);
our @metakeywords;      # contents of <meta name="keywords"> tags
our @metaname;          # other <meta> names, in order of first appearance
our %meta;              # <meta> name => newline-separated contents

Getopt::Std::getopts('u:');
our $url = $Getopt::Std::opt_u;

our $file = shift(@ARGV);
die "Usage: HTML-sum.pl [-u URL] filename"
    unless (defined($file) && length($file));

# Look up an attribute value, ignoring the case of the attribute name.
#
#   $attr   - hash reference of attributes as passed to start()
#   $wanted - attribute name, in lower case
#
# Returns the value, or undef when the attribute is absent.  Always call
# this in scalar context (assign the result to a scalar).
#
sub _get_attr_ci
{
    my($attr, $wanted) = @_;

    # Sorted so that the result is deterministic should two keys differ
    # only by case.
    for my $key (sort keys %$attr) {
        return $attr->{$key} if lc($key) eq $wanted;
    }
    return;
}

# Called for each opening HTML tag
#
#   $tag  - tag name
#   $attr - hash reference of the tag's attributes
#
sub start
{
    my($self, $tag, $attr) = @_;

    $titletext++    if ($tag eq 'title');
    $headingstext++ if ($tag =~ /^h[1-6]$/);
    $keywordstext++ if ($isKeyTag{$tag});

    if ($tag eq 'meta') {
        my $name    = _get_attr_ci($attr, 'name');
        my $content = _get_attr_ci($attr, 'content');

        # A <meta> without a name (e.g. http-equiv) has no SOIF attribute
        # name to be filed under, so it is skipped rather than emitted as
        # an attribute with an empty name.
        if (defined($name) && length($name)) {
            $name =~ tr/A-Z/a-z/;       # ignore case of name
            $content = '' unless defined($content);
            if ($name eq 'keywords') {
                # Treat this specially - we'll use it
                # later to over-ride generated value
                push(@metakeywords, $content);
            } else {
                push(@metaname, $name) unless exists($meta{$name});
                $meta{$name} .= "$content\n";
            }
        }
    }

    if ($tag eq 'a') {
        # An entry is recorded even when there is no href (e.g. a named
        # anchor): end() adds one url-texts entry per anchor, so both
        # lists keep one entry per anchor.
        my $href = _get_attr_ci($attr, 'href');
        $href = '' unless defined($href);
        push(@soif::urlreferences, $href);
        $urltext++;
    }

    if ($tag eq 'script') {
        # $lang must be local to this call, otherwise a <script> without a
        # language attribute would be counted under the previous tag's
        # language.
        my $lang = _get_attr_ci($attr, 'language');
        $soif::scriptlanguages{$lang}++
            if (defined($lang) && length($lang));
        $inscript++;
    }

    if ($isParaTag{$tag}) {
        push(@soif::fulltext, "\n") if $splitFullText;
    }

    if ($tag eq 'applet') {
        my $code = _get_attr_ci($attr, 'code');
        push(@soif::applets, $code) if defined($code);
    }

    if ($tag eq 'img') {
        my $alt = _get_attr_ci($attr, 'alt');
        my $src = _get_attr_ci($attr, 'src');
        $alt = '' unless defined($alt);
        $src = '' unless defined($src);
        # The alt text stands in for the image in the full text, and the
        # image is recorded as a reference with the alt text as its text.
        push(@soif::fulltext, "\n\n$alt\n\n");
        push(@soif::urlreferences, $src);
        push(@soif::urltexts, "$alt\n");
    }
}

# Called for each closing HTML tag
#
#   $tag - tag name
#
# The nesting counters are only decremented while positive, so a stray
# closing tag cannot drive them negative.
#
sub end
{
    my($self, $tag) = @_;

    if ($tag eq 'title') {
        $titletext-- if $titletext;
    }
    if ($tag =~ /^h[1-6]$/) {
        $headingstext-- if $headingstext;
    }
    if ($isKeyTag{$tag}) {
        $keywordstext-- if $keywordstext;
    }
    if ($tag eq 'a') {
        push(@soif::urltexts, "\n");    # terminate this anchor's text
        $urltext-- if $urltext;
    }
    if ($tag eq 'script') {
        $inscript-- if $inscript;
    }
    if ($isParaTag{$tag}) {
        push(@soif::fulltext, "\n") if $splitFullText;
    }
}

# Called for each bit of text
#
#   $t - the text
#
# The text is always added to the full text; it is additionally added to
# the title, headings, keywords and url-texts when the parser is inside
# the corresponding element.
#
sub text
{
    my($self, $t) = @_;

    return if $inscript;                # Chuck out any script text

    # Strip leading newline
    $t =~ s/^\n+//;
#    $t =~ s/\s+/ /g;
    # Multiple spaces or tabs to single space...
    $t =~ s/[ \t]+/ /g;
    # Multiple newlines to single newline...
    $t =~ s/\n+/\n/g;
    # Throw out whitespace only lines
    return if ($t =~ /^\s*$/);

    $t = HTML::Entities::decode_entities($t);
    # Test the length, not the truth value, so that the text "0" is kept.
    return unless length($t);

    push(@soif::fulltext, $t);

    # Remove leading and trailing whitespace...
    $t =~ s/^\s*//;
    $t =~ s/\s*$//;
    # Entity decoding may have produced whitespace-only text.
    return unless length($t);

    push(@soif::title, $t)    if $titletext;
    push(@soif::headings, $t) if $headingstext;
    $soif::keywords{$t}++     if $keywordstext;
    push(@soif::urltexts, $t) if $urltext;
}

# Called for each HTML comment
#
#   $_[0] (after $self) - the comment text
#
sub comment
{
    my($self, $text) = @_;

    return if $inscript;                # Chuck out any script text
    push(@soif::comments, $text);
}

# Parse the HTML file
#
my $p = P->new;
# parse_file() returns undef when the file cannot be opened; fail loudly
# instead of emitting an empty record.
defined($p->parse_file($file))
    or die "HTML-sum.pl: cannot parse $file: $!\n";

# Generate SOIF attribute/values
#
print "\@FILE { $url\n" if $url;

soif::print_av('title', join(" ", @soif::title)) if @soif::title;
soif::print_av('headings', join("\n", @soif::headings)) if @soif::headings;

# Keywords named in <meta name="keywords"> over-ride the generated ones.
push(@soif::keywords, keys %soif::keywords);
@soif::keywords = @metakeywords if (@metakeywords);
soif::print_av('keywords', join("\n", sort @soif::keywords))
    if @soif::keywords;

if (@soif::fulltext) {
    my $fulltext = join("\n", @soif::fulltext);
    $fulltext =~ s/\n\n+/\n\n/g;        # at most one blank line in a row
    $fulltext =~ s/^\s*//;
    $fulltext =~ s/\s*$//;
    soif::print_av('full-text', $fulltext);
}

soif::print_av('url-references', join("\n", @soif::urlreferences))
    if @soif::urlreferences;

if (@soif::urltexts && $createURLTexts) {
    # Entries already carry their own newline terminators.
    my $urltexts = join("", @soif::urltexts);
    $urltexts =~ s/^\s*//;
    $urltexts =~ s/\s*$//;
    soif::print_av('url-texts', $urltexts);
}

# One SOIF attribute per <meta> name, in order of first appearance.
foreach my $name (@metaname) {
    $meta{$name} =~ s/\s+$//;
    soif::print_av($name, $meta{$name});
}

soif::print_av('comments', join("\n", @soif::comments))
    if (@soif::comments && $createComments);

push(@soif::scriptlanguages, keys %soif::scriptlanguages);
soif::print_av('script-languages', join("\n", sort @soif::scriptlanguages))
    if (@soif::scriptlanguages && $createScriptLanguages);

soif::print_av('applets', join("\n", sort @soif::applets))
    if (@soif::applets && $createApplets);

print "}\n" if $url;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -