📄 mygrabber.pm
字号:
#!/usr/bin/perl -w
use strict;
#########################
# MyGrabber Package
# Yeni, 2006/11
# yeni@yueds.com
#########################
# This script is a part of UCPro project.
# For more information, please visit http://tuc.cn/pro/
# Grabbing webpages, analyzing them by Parser Tree or special tag.
#
# getPage -
# download the webpage only.
#
# getPageAndParse -
# download the webpage and then parse it with the HTML parser tree.
# you can define the handler to process start tag, end tag or text.
#
# getPageAndSearch -
# download the webpage and then search for substrings.
# you can define some search rules.
#
# newSearchRule -
# create a new search rule set.
#
# addSearchRule -
# add a search rule to the set; a rule specifies the strings the target
# substring begins and ends with.
package MyGrabber;
use LWP::UserAgent; # libwww for downloading HTTP contents
use HTML::Parser; # HTML Parser for work with HTML contents
# Global declarations
our @links; # the links list of current page
our $rulefunc; # $rulefunc - after found the string, the function called
my @searchrule; # the rules of searching strings
my $output_url = 1; # whether output the url
# Functions
# Start-tag handler: collect the href attribute of every <a> tag into @links.
sub links_start {
    my ($tag, $attr, $attrseq, $origtext) = @_;
    # Only anchor tags carry the links we are interested in.
    return unless $tag eq 'a';
    my $href = $attr->{'href'};
    push(@links, $href) if defined $href;
}
# End-of-document handler: report how many links were collected.
sub links_final {
    # scalar(@links) is the element count; the original printed $#links,
    # the last index, which under-reported the total by one.
    print scalar(@links) . " links detected.\n";
}
# No-op start-tag handler: accepts the standard start-tag arguments and
# deliberately does nothing with them.
sub null_start {
    my ($tagname, $attr_ref, $attr_order, $raw_text) = @_;
}
# No-op text handler: accepts the text chunk and deliberately ignores it.
sub null_text {
    my ($chunk) = @_;
}
sub null_final { }
# get page and get parsed
# Download a page and run it through HTML::Parser with caller-supplied handlers.
#
#   $url           - url address of the page
#   $capt          - job name (caption), optional
#   $start_handler - code ref, called for each start tag
#   $text_handler  - code ref, called for text content
#   $final_handler - code ref, called once after parsing succeeds
#
# Returns "ERROR" when the download fails; otherwise returns whatever
# $final_handler returns.
sub getPageAndParse {
    my ($url, $capt, $start_handler, $text_handler, $final_handler) = @_;
    my $html = getPage($url, $capt);
    if ($html eq "") {
        return "ERROR";
    }
    print "Parsing...";
    # Create the parser (event-driven API, version 3).
    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [$start_handler, "tagname, attr, attrseq, text"],
        text_h      => [$text_handler, "text"],
        default_h   => [sub { }, "text"],
    );
    # Start with a fresh link list for this page.
    @links = ();
    # Parse the document text as UTF-8 octets.
    $p->utf8_mode(1);
    $p->parse($html);
    $p->eof;
    print "ok.\n";
    # Call the handler with no arguments.  The original `&$final_handler;`
    # silently re-passed this sub's own @_ to the handler, which was
    # almost certainly unintended.
    return $final_handler->();
}
# clear search rule
# Discard all previously registered search rules, starting a new rule set.
sub newSearchRule {
    @searchrule = ();
}
# add search rule
# Register a search rule.
#
#   $rulename  - name for this rule (passed through to $rulefunc)
#   $beginwith - substring marking the start of a match
#   $endwith   - substring marking the end of a match
#   $max_times - maximum number of matches, 0 for unlimited
#   $reset     - 1 to restart searching from the top of the document,
#                otherwise the search continues where the last rule stopped
#
# When getPageAndSearch finds a target string, $rulefunc is called as
# $rulefunc->($target_str, $rulename).
sub addSearchRule {
    my ($rulename, $beginwith, $endwith, $max_times, $reset) = @_;
    # Store a copy of the arguments.  The original pushed \@_, whose
    # elements alias the caller's variables, so any later change to those
    # variables would silently rewrite the stored rule.
    push @searchrule, [$rulename, $beginwith, $endwith, $max_times, $reset];
}
# Download a page and run every registered search rule against it.
#
#   $url  - url address of the page
#   $capt - job name (caption), optional
#
# Returns "ERROR" when the download fails.  For every match a rule
# produces, $rulefunc is called as $rulefunc->($matched_str, $rulename).
sub getPageAndSearch {
    my ($url, $capt) = @_;
    my $html = getPage($url, $capt);
    if ($html eq "") {
        return "ERROR";
    }
    my $count    = 0;   # number of rules processed so far
    my $iterator = 0;   # current search position within $html
    foreach my $rule (@searchrule) {
        # Unpack the rule details.
        my ($rulename, $beginwith, $endwith, $max_times, $reset) = @$rule;
        # 1-based progress out of the total rule count (the original
        # printed the 0-based index over the last array index).
        printf "Searching rule %d/%d %s...", $count + 1, scalar(@searchrule), $rulename;
        my $found = 0;  # matches produced by this rule
        # Guard against an omitted $reset (undef would warn under -w).
        $iterator = 0 if (defined $reset && $reset == 1);
        while (1) {
            # A false $max_times (0 or undef) means "unlimited".
            last if ($max_times && $found >= $max_times);
            # Locate the begin marker from the current position.
            my $beginpos = index($html, $beginwith, $iterator);
            last if ($beginpos < 0);            # not found -> rule finished
            $beginpos += length($beginwith);
            $iterator = $beginpos;
            my $endpos = index($html, $endwith, $iterator);
            if ($endpos < 0) {
                # End marker missing: take everything to the end of the
                # document (the original used length($html) - 1, which
                # dropped the final character).
                $endpos   = length($html);
                $iterator = 0;                  # auto reset for the next pass
            }
            else {
                # Found: continue searching after the end marker.
                $iterator = $endpos + length($endwith);
            }
            # Extract the substring and hand it to the processing callback.
            my $resstr = substr($html, $beginpos, $endpos - $beginpos);
            $rulefunc->($resstr, $rulename);
            $found++;
        }
        print "$found found.\n";
        $count++;
    }
}
# download page only
# Download a single page over HTTP.
#
#   $url  - url address of the page
#   $capt - job name (caption) shown in the progress banner, optional
#
# Returns the raw response body on success, or "" on any HTTP failure.
sub getPage {
    my ($url, $capt) = @_;
    # $capt is documented as optional; default it to avoid -w warnings
    # when it is interpolated below.
    $capt = '' unless defined $capt;
    # Create a user agent object.
    my $ua = LWP::UserAgent->new;
    # Note the added separator space: the original concatenated directly,
    # producing a malformed "Yenibot/1.0libwww-perl/..." User-Agent value.
    $ua->agent("Yenibot/1.0 " . $ua->agent);
    # Initialize proxy settings from environment variables.
    $ua->env_proxy;
    # Create the request.
    my $req = HTTP::Request->new(GET => $url);
    $req->header('Accept' => 'text/html');
    if ($output_url) {
        print "\n--------------- $capt ---------------\n$url\nDownloading...";
    }
    else {
        print "\n--------------- $capt ---------------\nDownloading...";
    }
    # Pass the request to the user agent and get a response back.
    my $res = $ua->request($req);
    if ($res->is_success) {
        print "ok.\n";
        # Return the raw octets: getPageAndParse feeds this to HTML::Parser
        # in utf8_mode, which expects undecoded content.
        return $res->content;
    }
    print "failed.\nError: " . $res->status_line . "\n";
    return "";
}
# crop out a string between two string
# Extract the substring of $expr that lies between $beginwith and $endwith.
#
#   $expr      - string to search in
#   $beginwith - marker preceding the wanted text
#   $endwith   - marker following the wanted text; "" means "to end of string"
#
# Returns "" when $beginwith does not occur.  When $endwith is empty or not
# found, everything after $beginwith is returned.  (The original used
# length($expr) - 1 in those cases, silently dropping the last character.)
sub cropOut {
    my ($expr, $beginwith, $endwith) = @_;
    my $beginpos = index($expr, $beginwith, 0);
    return "" if ($beginpos < 0);               # begin marker not present
    $beginpos += length($beginwith);
    my $endpos;
    if ($endwith eq '') {
        $endpos = length($expr);                # crop to the very end
    }
    else {
        $endpos = index($expr, $endwith, $beginpos);
        # End marker missing: crop to the end of the string.
        $endpos = length($expr) if ($endpos < 0);
    }
    return substr($expr, $beginpos, $endpos - $beginpos);
}
1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -