⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 html-sum.pl

📁 harvest是一个下载html网页的机器人
💻 PL
字号:
: # *-*-perl-*-*
    eval 'exec perl -S $0 "$@"'
    if $running_under_some_shell;
#
#  HTML-sum.pl
#
#  $Id: HTML-sum.pl,v 2.2 2000/02/03 12:45:55 sxw Exp $
#
#  andy powell <a.powell@ukoln.ac.uk>
#
#############################################################################
#
#  Harvest Indexer http://harvest.sourceforge.net/
#  -----------------------------------------------
#
#  The Harvest Indexer is a continued development of code developed by
#  the Harvest Project. Development is carried out by numerous individuals
#  in the Internet community, and is not officially connected with the
#  original Harvest Project or its funding sources.
#
#  Please mail lee@arco.de if you are interested in participating
#  in the development effort.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#############################################################################
#
# Parse HTML file to generate SOIF record on STDOUT
#
# Usage: HTML-sum.pl [-u URL] filename
#
# Giving the -u URL option causes a complete SOIF record to be generated
# including an opening
#
# @FILE { URL
#
# and a closing
#
# }
#
# Otherwise a simple list of SOIF attributes is generated which is what
# Harvest expects from HTML-sum.pl.

$ENV{'HARVEST_HOME'} = "/usr/local/harvest" if (!defined($ENV{'HARVEST_HOME'}));
unshift(@INC, "$ENV{'HARVEST_HOME'}/lib");      # use local files
require "soif.pl";
require HTML::Parser;
use Getopt::Std;
use HTML::Entities ();

# Everything below lives in package P, a subclass of HTML::Parser whose
# start/end/text/comment methods are invoked by the parser as callbacks.
package P; @ISA = qw(HTML::Parser);

# Change these as desired...
#
$createURLTexts = 1; # Create SOIF 'url-texts' attribute to match
                     # 'url-references'
$createComments = 1; # Create SOIF 'comments' attribute from HTML comments
$createScriptLanguages = 1; # Create SOIF 'script-languages' attribute to
                     # indicate scripting languages used on page
$createApplets = 1;  # Create SOIF 'applets' attribute from HTML applet tags
$splitFullText = 1;  # Add blank lines to SOIF 'full-text' attribute on some
                     # tags - may help with later text processing of SOIF
                     # record?

@paraTags = (        # List of tags that cause a paragraph break...
    'title',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    'ul',
    'ol',
    'dl',
    'pre',
    'div',
    'center',
    'blockquote',
    'form',
    'hr',
    'table',
);

@keytags = (         # List of tags that go into SOIF keywords...
    'b',
    'i',
    'a',
    'em',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
);

# Exact-match lookup tables built from the lists above.  The tag name is
# compared literally, so characters such as '.' in a tag name are never
# treated as regex metacharacters.
%isParaTag = map { $_ => 1 } @paraTags;
%isKeyTag  = map { $_ => 1 } @keytags;

Getopt::Std::getopts('u:');
$url = $Getopt::Std::opt_u;

die "Usage: HTML-sum.pl [-u URL] filename" unless ($file = shift(@ARGV));

# Return the value of the attribute called $wanted (given in lower case)
# from the attribute hash ref $attr, matching the attribute name without
# regard to case.  Returns undef when no such attribute is present.
# Uses a lexical loop variable so the caller's $_ is left untouched.
#
sub _attr_value
{
    my($attr, $wanted) = @_;
    my $value;

    for my $key (keys %$attr) {
        $value = $attr->{$key} if (lc($key) eq $wanted);
    }
    return $value;
}

# Called for each opening HTML tag
#
#   $self - the parser object
#   $tag  - name of the tag being opened
#   $attr - hash ref of the tag's attributes
#
# Bumps the "inside a title/heading/keyword/anchor/script" counters that
# text() consults, and records meta data, link targets, script languages,
# applet code names and image alt text.
#
sub start
{
    my($self, $tag, $attr) = @_;

    if ($tag eq 'title') {
        $titletext++;
    }
    if ($tag =~ /^h[1-6]$/) {
        $headingstext++;
    }
    if ($isKeyTag{$tag}) {
        $keywordstext++;
    }
    if ($tag eq 'meta') {
        my $name    = _attr_value($attr, 'name');
        my $content = _attr_value($attr, 'content');

        # A meta tag without a name attribute (http-equiv, charset, ...)
        # has nothing to use as an attribute name, so it is skipped rather
        # than recorded under an empty name.
        if (defined($name) && length($name)) {
            $name =~ tr/A-Z/a-z/; # ignore case of name
            if ($name eq 'keywords') { # Treat this specially - we'll use it
                                       # later to over-ride generated value
                push(@metakeywords, $content) if defined($content);
            }
            else {
                $content = '' unless defined($content);
                push(@metaname, $name) unless $meta{$name};
                $meta{$name} .= "$content\n";
            }
        }
    }
    if ($tag eq 'a') {
        # The href is pushed even when it is absent (undef); the entry is
        # kept so the number of url-references entries is unchanged.
        my $href = _attr_value($attr, 'href');
        push(@soif::urlreferences, $href);
        $urltext++;
    }
    if ($tag eq 'script') {
        # $lang is local to this tag, so a script tag without a language
        # attribute does not re-count the language of an earlier tag.
        my $lang = _attr_value($attr, 'language');
        $soif::scriptlanguages{$lang}++ if defined($lang);
        $inscript++;
    }
    if ($isParaTag{$tag}) {
        push(@soif::fulltext, "\n") if $splitFullText;
    }
    if ($tag eq 'applet') {
        my $code = _attr_value($attr, 'code');
        push(@soif::applets, $code);
    }
    if ($tag eq 'img') {
        my $alt = _attr_value($attr, 'alt');
        my $src = _attr_value($attr, 'src');

        $alt = '' unless defined($alt);
        push(@soif::fulltext, "\n\n$alt\n\n");
        push(@soif::urlreferences, $src);
        push(@soif::urltexts, "$alt\n");
    }
}

# Called for each closing HTML tag
#
#   $self - the parser object
#   $tag  - name of the tag being closed
#
# Decrements the counters bumped by start().  A counter that would go
# negative (a closing tag with no matching opening tag) is reset to undef.
#
sub end
{
    my($self, $tag) = @_;

    if ($tag eq 'title') {
        $titletext--;
        undef $titletext if ($titletext < 0);
    }
    if ($tag =~ /^h[1-6]$/) {
        $headingstext--;
        undef $headingstext if ($headingstext < 0);
    }
    if ($isKeyTag{$tag}) {
        $keywordstext--;
        undef $keywordstext if ($keywordstext < 0);
    }
    if ($tag eq 'a') {
        push(@soif::urltexts, "\n");
        $urltext--;
        undef $urltext if ($urltext < 0);
    }
    if ($tag eq 'script') {
        $inscript--;
        undef $inscript if ($inscript < 0);
    }
    if ($isParaTag{$tag}) {
        push(@soif::fulltext, "\n") if $splitFullText;
    }
}

# Called for each bit of text
#
#   $self  - the parser object
#   $_[0]  - the text chunk (after $self has been shifted off)
#
# Normalises whitespace, decodes HTML entities and files the text under
# full-text plus whichever of title/headings/keywords/url-texts are
# currently "open" according to the counters kept by start() and end().
#
sub text
{
    my $self = shift;
    my $t = $_[0];

    return if $inscript; # Chuck out any script text

    # Strip leading newline
    $t =~ s/^\n+//;
#    $t =~ s/\s+/ /g;
    # Multiple spaces or tabs to single space...
    $t =~ s/[ \t]+/ /g;
    # Multiple newlines to single newline...
    $t =~ s/\n+/\n/g;
    # Throw out whitespace only lines
    return if ($t =~ /^\s*$/);

    $t = HTML::Entities::decode_entities($t);

    # Test the length, not the truth value, so that a chunk consisting of
    # the single character "0" is kept.
    if (length($t)) {
        push(@soif::fulltext, $t);

        # Remove leading and trailing whitespace...
        $t =~ s/^\s*//;
        $t =~ s/\s*$//;

        push(@soif::title, $t) if $titletext;
        push(@soif::headings, $t) if $headingstext;
        $soif::keywords{$t}++ if $keywordstext;
        push(@soif::urltexts, $t) if $urltext;
    }
}

# Called for each HTML comment
#
#   $self  - the parser object
#   $_[0]  - the comment text (after $self has been shifted off)
#
sub comment
{
    my $self = shift;

    return if $inscript; # Chuck out any script text

    push(@soif::comments, $_[0]);
}

# Parse the HTML file
#
$p = P->new;
# parse_file returns undef (with the reason in $!) when the file cannot be
# opened; stop here rather than emit an empty record without explanation.
defined($p->parse_file($file))
    or die "HTML-sum.pl: cannot parse $file: $!\n";

# Generate SOIF attribute/values
#
print "\@FILE { $url\n" if $url;

soif::print_av('title', join(" ", @soif::title)) if @soif::title;
soif::print_av('headings', join("\n", @soif::headings)) if @soif::headings;

# Keywords gathered from the text are replaced wholesale by the values of
# any <meta name="keywords"> tags.
push(@soif::keywords, keys %soif::keywords);
@soif::keywords = @metakeywords if (@metakeywords);
soif::print_av('keywords', join("\n", sort @soif::keywords)) if @soif::keywords;

if (@soif::fulltext) {
    $tmp = join("\n", @soif::fulltext);
    $tmp =~ s/\n\n+/\n\n/g;
    $tmp =~ s/^\s*//;
    $tmp =~ s/\s*$//;
    soif::print_av('full-text', $tmp);
}

soif::print_av('url-references', join("\n", @soif::urlreferences)) if @soif::urlreferences;

if (@soif::urltexts && $createURLTexts) {
    $tmp = join("", @soif::urltexts);
    $tmp =~ s/^\s*//;
    $tmp =~ s/\s*$//;
    soif::print_av('url-texts', $tmp);
}

# One attribute per distinct meta name, in order of first appearance.
foreach (@metaname) {
    $meta{$_} =~ s/\s+$//;
    soif::print_av($_, $meta{$_});
}

soif::print_av('comments', join("\n", @soif::comments)) if (@soif::comments && $createComments);

push(@soif::scriptlanguages, keys %soif::scriptlanguages);
soif::print_av('script-languages', join("\n", sort @soif::scriptlanguages)) if (@soif::scriptlanguages && $createScriptLanguages);

soif::print_av('applets', join("\n", sort @soif::applets)) if (@soif::applets && $createApplets);

print "}\n" if $url;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -