📄 mycrawler.pm

📁 利用lwp：：get写的
💻 PM
字号:
#!/usr/bin/perl -w
use strict;

#########################
# MyCrawler Package
# Yeni, 2006/11
# yeni@yueds.com
#########################
# This script is part of the UCPro project.
# For more information, please visit http://tuc.cn/pro/

# Service package for a paper crawler that writes its results as XML.
# Before calling it, you must define the site-specific rules it relies on
# (such as $links_filter and $pagebase, declared below).

package MyCrawler;

use MyGrabber;      # my html input package
use MyXMLWriter;    # my xml output package

# Declarations
### Per-page properties. toc() empties all three before fetching each page
### and calls xmlout() after each successful fetch.
### NOTE(review): presumably filled by the search rules run inside
### MyGrabber::getPageAndSearch - confirm against MyGrabber.
our @authors;       # authors of the current page
our @cits;          # citations of the current page
our %props;         # other string properties of the current page
###
our @availlinks;    # links toc() follows; toc() empties it, then calls $links_filter
our $pagebase;      # base URL that toc() prepends to every entry of @availlinks
                    # (the TOC url itself is used as given)
our $links_filter;  # code ref called by toc(); expected to filter
                    # @MyGrabber::links into @availlinks

our $current_conf;  # conference name given to the most recent toc() call
our $current_year;  # year given to the most recent toc() call

############### DO NOT EDIT BELOW ###############

# toc
# Download the table-of-contents (TOC) page of one conference year, then
# fetch every page the link filter selected and write each result as XML.
#
# Parameters:
#   $confname - the name of the conference
#   $year     - which year
#   $toc_url  - the url of the TOC
#
# Package variables used:
#   $links_filter  - code ref, called with the same arguments toc received;
#                    it is expected to fill @availlinks
#   $pagebase      - prepended to every link before it is fetched
#   @authors, @cits, %props - emptied before each page is fetched
#   $current_conf, $current_year - set from $confname / $year
#
# Output is written through MyXMLWriter to
# "${confname}_${year}_elementary.txt", wrapped in a 'proceedings' element.
# When the TOC page cannot be fetched the element and the file are still
# closed, and the sub returns nothing.

sub toc {
    my ($confname, $year, $toc_url) = @_;
    $current_conf = $confname;
    $current_year = $year;
    MyXMLWriter::open("${confname}_${year}_elementary.txt");
    MyXMLWriter::start('proceedings');

    # Get the TOC page and parse it with the default 'link parser'.
    my $ret = MyGrabber::getPageAndParse(
        $toc_url, "TOC OF '$confname - $year'",
        \&MyGrabber::links_start,
        \&MyGrabber::null_text,
        \&MyGrabber::links_final
    );

    if ($ret eq "ERROR") {
        # Do not leave the output file open with an unterminated root
        # element: finish it exactly as the normal path does for an empty
        # link list.
        MyXMLWriter::end('proceedings');
        MyXMLWriter::close();
        return;
    }

    @availlinks = ();   # clear the list

    # Filter the links down to what we really need. The filter receives
    # toc's own arguments, now passed explicitly rather than through the
    # implicit @_ sharing of the old `&$links_filter;` call form.
    $links_filter->(@_);

    # Report the number of links, not the last index.
    my $total = scalar @availlinks;
    print "$total links valid.\n";

    # An unset $pagebase means "links are used as they are"; avoid an
    # uninitialized-value warning on every concatenation.
    my $base = defined $pagebase ? $pagebase : '';

    # Follow each filtered link and process the page with the defined
    # search rules.
    my $count = 0;
    foreach my $url (@availlinks) {
        # Clear the per-page properties.
        %props   = ();
        @authors = ();
        @cits    = ();

        # Grab the page and search it. The progress label is zero-based:
        # it runs from 0 to the last index.
        $ret = MyGrabber::getPageAndSearch($base . $url, "Page $count / $#availlinks");
        $count++;
        next if $ret eq "ERROR";

        # XML output for this page.
        xmlout();
    }

    MyXMLWriter::end('proceedings');
    MyXMLWriter::close();
}

# xmlout
# output as specified XML format
sub xmlout {
    # output format:
    #<paper>
    #<title>论文题目 (paper title)</title>
    #<author>
    #<AuthorName>浣滆
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -