📄 mycrawler.pm
字号:
#!/usr/bin/perl -w
use strict;
#########################
# MyCrawler Package
# Yeni, 2006/11
# yeni@yueds.com
#########################
# This script is part of the UCPro project.
# For more information, please visit http://tuc.cn/pro/
# It is the service package for a paper crawler with XML output.
# You need to define the search rules before it can do any work.
package MyCrawler;
use MyGrabber; # my html input package
use MyXMLWriter; # my xml output package
# Declarations
### for each page, we must get the following properties.
### toc() resets all three before it grabs each page.
### NOTE(review): presumably filled in by the user-defined search rules - confirm.
our @authors; # Authors
our @cits; # citations
our %props; # other string properties
###
our @availlinks; # Available links - toc() follows each entry and processes one page per entry
our $pagebase; # $pagebase - the base url for all pages other than the start_url; toc() prepends it to every available link
our $links_filter; # $links_filter - a func ref, filtering from @MyGrabber::links to @availlinks; toc() calls it after the TOC page has been parsed
our $current_conf; # conference name given to the most recent toc() call
our $current_year; # year given to the most recent toc() call
############### DO NOT EDIT BELOW ###############
# toc
# Download the TOC (table of contents) page, then dig in: follow every link
# accepted by $links_filter, run the search rules on each page and write one
# XML record per successfully grabbed page.
#
# Parameters:
#   $confname - the name of the conference
#   $year     - which year
#   $toc_url  - the url of the TOC
#
# Package state used:
#   $links_filter - code ref that must be set before calling toc(); it is
#                   expected to fill @availlinks.
#   $pagebase     - prefix prepended to every entry of @availlinks.
#   @authors, @cits, %props - reset before each page is grabbed.
#   $current_conf, $current_year - set from the arguments.
#
# Output is written through MyXMLWriter to "${confname}_${year}_elementary.txt",
# wrapped in a 'proceedings' element. The element and the file are closed on
# every exit path, including a failed TOC download.
#
# Returns an empty list / undef when the TOC page cannot be fetched, otherwise
# whatever MyXMLWriter::close returns.
sub toc {
    my ($confname, $year, $toc_url) = @_;

    $current_conf = $confname;
    $current_year = $year;

    MyXMLWriter::open("${confname}_${year}_elementary.txt");
    MyXMLWriter::start('proceedings');

    # get page and parse with default 'link parser'
    my $ret = MyGrabber::getPageAndParse(
        $toc_url, "TOC OF '$confname - $year'",
        \&MyGrabber::links_start,
        \&MyGrabber::null_text,
        \&MyGrabber::links_final,
    );

    if ($ret eq "ERROR") {
        # Do not leave the output file open with an unterminated root
        # element when the TOC itself could not be fetched.
        MyXMLWriter::end('proceedings');
        MyXMLWriter::close();
        return;
    }

    @availlinks = ();    # clear the list

    # Filter the parsed links down to what we really need.
    # The original '&$links_filter;' form implicitly forwarded toc()'s own
    # @_; the same arguments are passed explicitly here so existing filters
    # keep seeing them.
    $links_filter->(@_);

    # scalar(@availlinks) is the number of links; $#availlinks is only the
    # last index (one less, and -1 for an empty list).
    my $total = scalar @availlinks;
    print "$total links valid.\n";

    # follow each link filtered,
    # and process each page with the defined search rules.
    my $count = 0;
    foreach my $url (@availlinks) {
        $count++;    # 1-based so the label reads "Page 1 / N" .. "Page N / N"

        # clear the properties collected for the previous page
        %props   = ();
        @authors = ();
        @cits    = ();

        # grab and search for it
        $ret = MyGrabber::getPageAndSearch($pagebase . $url, "Page $count / $total");
        next if $ret eq "ERROR";

        # xml output
        xmlout();
    }

    MyXMLWriter::end('proceedings');
    return MyXMLWriter::close();
}
# xmlout
# output as specified XML format
sub xmlout {
# output format:
#<paper>
#<title>paper title</title>
#<author>
#<AuthorName>浣滆
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -