📄 mygrabber.pm
字号:
#!/usr/bin/perl -w
use strict;
#########################
# MyGrabber Package
# Yeni, 2006/11
# yeni@yueds.com
#########################
# This script is a part of UCPro project.
# For more information, please visit http://tuc.cn/pro/
# Grabbing webpages, analyzing them by Parser Tree or special tag.
#
# getPage -
# download the webpage only.
#
# getPageAndParse -
# download the webpage and then parse it with the HTML parser tree.
# you can define the handler to process start tag, end tag or text.
#
# getPageAndSearch -
# download the webpage and then search for substrings.
# you can define some search rules.
#
# newSearchRule -
# create a new search rule set.
#
# addSearchRule -
# add a search rule to the set; a rule specifies the strings the target
# substring begins and ends with.
package MyGrabber;
use LWP::UserAgent; # libwww for downloading HTTP contents
use HTML::Parser; # HTML Parser for work with HTML contents
# Global declarations
our @links; # the links list of current page
our $rulefunc; # $rulefunc - after found the string, the function called
my @searchrule; # the rules of searching strings
my $output_url = 1; # whether output the url
# Functions
# Start-tag handler: collect the href attribute of every <a> tag into @links.
sub links_start {
    my ($tag, $attr, $attrseq, $origtext) = @_;
    # Only anchor tags carry the links we are interested in.
    return unless $tag eq 'a';
    my $href = $attr->{'href'};
    push(@links, $href) if defined $href;
}
# End-of-document handler: report how many links were collected.
sub links_final {
    # scalar(@links) is the element count; the original printed $#links,
    # the last index, which under-reported the total by one.
    print scalar(@links) . " links detected.\n";
}
# No-op start-tag handler: accepts the standard start-tag arguments and
# deliberately does nothing with them.
sub null_start {
    my ($tagname, $attr_ref, $attr_order, $raw_text) = @_;
}
# No-op text handler: accepts the text chunk and deliberately ignores it.
sub null_text {
    my ($chunk) = @_;
}
sub null_final { }
# get page and get parsed
# Download a page and run it through HTML::Parser with caller-supplied handlers.
#
#   $url           - url address of the page
#   $capt          - job name (caption), optional
#   $start_handler - code ref, called for each start tag
#   $text_handler  - code ref, called for text content
#   $final_handler - code ref, called once after parsing succeeds
#
# Returns "ERROR" when the download fails; otherwise returns whatever
# $final_handler returns.
sub getPageAndParse {
    my ($url, $capt, $start_handler, $text_handler, $final_handler) = @_;
    my $html = getPage($url, $capt);
    if ($html eq "") {
        return "ERROR";
    }
    print "Parsing...";
    # Create the parser (event-driven API, version 3).
    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [$start_handler, "tagname, attr, attrseq, text"],
        text_h      => [$text_handler, "text"],
        default_h   => [sub { }, "text"],
    );
    # Start with a fresh link list for this page.
    @links = ();
    # Parse the document text as UTF-8 octets.
    $p->utf8_mode(1);
    $p->parse($html);
    $p->eof;
    print "ok.\n";
    # Call the handler with no arguments.  The original `&$final_handler;`
    # silently re-passed this sub's own @_ to the handler, which was
    # almost certainly unintended.
    return $final_handler->();
}
# clear search rule
# Discard all previously registered search rules, starting a new rule set.
sub newSearchRule {
    @searchrule = ();
}
# add search rule
# Register a search rule.
#
#   $rulename  - name for this rule (passed through to $rulefunc)
#   $beginwith - substring marking the start of a match
#   $endwith   - substring marking the end of a match
#   $max_times - maximum number of matches, 0 for unlimited
#   $reset     - 1 to restart searching from the top of the document,
#                otherwise the search continues where the last rule stopped
#
# When getPageAndSearch finds a target string, $rulefunc is called as
# $rulefunc->($target_str, $rulename).
sub addSearchRule {
    my ($rulename, $beginwith, $endwith, $max_times, $reset) = @_;
    # Store a copy of the arguments.  The original pushed \@_, whose
    # elements alias the caller's variables, so any later change to those
    # variables would silently rewrite the stored rule.
    push @searchrule, [$rulename, $beginwith, $endwith, $max_times, $reset];
}
# Download a page and run every registered search rule against it.
#
#   $url  - url address of the page
#   $capt - job name (caption), optional
#
# Returns "ERROR" when the download fails.  For every match a rule
# produces, $rulefunc is called as $rulefunc->($matched_str, $rulename).
sub getPageAndSearch {
    my ($url, $capt) = @_;
    my $html = getPage($url, $capt);
    if ($html eq "") {
        return "ERROR";
    }
    my $count    = 0;   # number of rules processed so far
    my $iterator = 0;   # current search position within $html
    foreach my $rule (@searchrule) {
        # Unpack the rule details.
        my ($rulename, $beginwith, $endwith, $max_times, $reset) = @$rule;
        # 1-based progress out of the total rule count (the original
        # printed the 0-based index over the last array index).
        printf "Searching rule %d/%d %s...", $count + 1, scalar(@searchrule), $rulename;
        my $found = 0;  # matches produced by this rule
        # Guard against an omitted $reset (undef would warn under -w).
        $iterator = 0 if (defined $reset && $reset == 1);
        while (1) {
            # A false $max_times (0 or undef) means "unlimited".
            last if ($max_times && $found >= $max_times);
            # Locate the begin marker from the current position.
            my $beginpos = index($html, $beginwith, $iterator);
            last if ($beginpos < 0);            # not found -> rule finished
            $beginpos += length($beginwith);
            $iterator = $beginpos;
            my $endpos = index($html, $endwith, $iterator);
            if ($endpos < 0) {
                # End marker missing: take everything to the end of the
                # document (the original used length($html) - 1, which
                # dropped the final character).
                $endpos   = length($html);
                $iterator = 0;                  # auto reset for the next pass
            }
            else {
                # Found: continue searching after the end marker.
                $iterator = $endpos + length($endwith);
            }
            # Extract the substring and hand it to the processing callback.
            my $resstr = substr($html, $beginpos, $endpos - $beginpos);
            $rulefunc->($resstr, $rulename);
            $found++;
        }
        print "$found found.\n";
        $count++;
    }
}
# download page only
# Download a single page over HTTP.
#
#   $url  - url address of the page
#   $capt - job name (caption) shown in the progress banner, optional
#
# Returns the raw response body on success, or "" on any HTTP failure.
sub getPage {
    my ($url, $capt) = @_;
    # $capt is documented as optional; default it to avoid -w warnings
    # when it is interpolated below.
    $capt = '' unless defined $capt;
    # Create a user agent object.
    my $ua = LWP::UserAgent->new;
    # Note the added separator space: the original concatenated directly,
    # producing a malformed "Yenibot/1.0libwww-perl/..." User-Agent value.
    $ua->agent("Yenibot/1.0 " . $ua->agent);
    # Initialize proxy settings from environment variables.
    $ua->env_proxy;
    # Create the request.
    my $req = HTTP::Request->new(GET => $url);
    $req->header('Accept' => 'text/html');
    if ($output_url) {
        print "\n--------------- $capt ---------------\n$url\nDownloading...";
    }
    else {
        print "\n--------------- $capt ---------------\nDownloading...";
    }
    # Pass the request to the user agent and get a response back.
    my $res = $ua->request($req);
    if ($res->is_success) {
        print "ok.\n";
        # Return the raw octets: getPageAndParse feeds this to HTML::Parser
        # in utf8_mode, which expects undecoded content.
        return $res->content;
    }
    print "failed.\nError: " . $res->status_line . "\n";
    return "";
}
# crop out a string between two string
# Extract the substring of $expr that lies between $beginwith and $endwith.
#
#   $expr      - string to search in
#   $beginwith - marker preceding the wanted text
#   $endwith   - marker following the wanted text; "" means "to end of string"
#
# Returns "" when $beginwith does not occur.  When $endwith is empty or not
# found, everything after $beginwith is returned.  (The original used
# length($expr) - 1 in those cases, silently dropping the last character.)
sub cropOut {
    my ($expr, $beginwith, $endwith) = @_;
    my $beginpos = index($expr, $beginwith, 0);
    return "" if ($beginpos < 0);               # begin marker not present
    $beginpos += length($beginwith);
    my $endpos;
    if ($endwith eq '') {
        $endpos = length($expr);                # crop to the very end
    }
    else {
        $endpos = index($expr, $endwith, $beginpos);
        # End marker missing: crop to the end of the string.
        $endpos = length($expr) if ($endpos < 0);
    }
    return substr($expr, $beginpos, $endpos - $beginpos);
}
1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -