#!/usr/bin/perl -w
# File: start_vldb_before_2000.pl
use strict;
#########################
# Start VLDB -2000
# Yeni, 2006/11
# yeni@yueds.com
#########################
# Defines the crawler rules used to fetch VLDB papers from the years up to and including 2000 (1994-2000).
use MyCrawler;
# Run the crawl, then terminate explicitly.
# main() is called with parentheses: the legacy `&main;` form implicitly
# forwards the current @_ to the callee, which is never intended here.
main();
exit;
############### PERSONALIZE PART BEGIN ###############
# Main entry.
# Installs the sigmod_* rule set and callbacks, sets the crawler's page base,
# then hands each VLDB table-of-contents page to MyCrawler::toc in year order.
sub main {
    # The VLDB pages are scraped with the sigmod_* rules and callbacks.
    sigmod_rules();
    $MyGrabber::rulefunc     = \&sigmod_rulefunc;
    $MyCrawler::links_filter = \&sigmod_links_filter;
    $MyCrawler::pagebase     = 'http://www.vldb.org/dblp/db/conf/vldb/';

    # [ two-digit year label, table-of-contents URL ], in crawl order.
    my @toc_pages = (
        [ '94', 'http://www.vldb.org/dblp/db/conf/vldb/vldb94.html' ],
        [ '95', 'http://www.vldb.org/dblp/db/conf/vldb/vldb95.html' ],
        [ '96', 'http://www.vldb.org/dblp/db/conf/vldb/vldb96.html' ],
        [ '97', 'http://www.vldb.org/dblp/db/conf/vldb/vldb97.html' ],
        [ '98', 'http://www.vldb.org/dblp/db/conf/vldb/vldb98.html' ],
        [ '99', 'http://www.vldb.org/dblp/db/conf/vldb/vldb99.html' ],
        [ '00', 'http://www.vldb.org/dblp/db/conf/vldb/vldb2000.html' ],
    );

    # Begin collecting from the TOCs.
    for my $page (@toc_pages) {
        my ($year, $toc_url) = @{$page};
        MyCrawler::toc('VLDB', $year, $toc_url);
    }
}
# Register the ordered page-search rules for content (paper) pages.
#
# Each rule is passed to MyGrabber::addSearchRule as
#   (rule name, start marker, end marker, once flag, continue flag)
# Per the original annotations, a once flag of 1 means the rule "appears
# once" and 0 means "unlimited"; a continue flag of 1 means the search is
# "continued with last search".
sub sigmod_rules {
    # Explicit parentheses make this a real call.  The bare form
    # `MyGrabber::newSearchRule;` is only parsed as a call when the sub is
    # already known at compile time; otherwise it is a bareword (fatal under
    # "use strict", a silent no-op without it).
    MyGrabber::newSearchRule();

    my @rules = (
        # rule name              start marker            end marker                  once  continue
        [ 'Title',               '<h1>',                 '</h1>',                    1,    1 ],
        [ 'Author',              ' author = {',          '},',                       1,    1 ],
        [ 'Year',                ' year = {',            '},',                       1,    1 ],
        [ 'Abstract',            '<h2>Abstract</h2>',    '<p><small><i>Copyright',   1,    1 ],
        # Parenthesised name: presumably only positions the search at the
        # reference list rather than yielding a stored value -- confirm
        # against MyGrabber.
        [ '(LeadToReferences)',  '<h2>References</h2>',  '<dl>',                     1,    1 ],
        [ 'Reference',           '<dd>',                 '<font size="-3">',         0,    1 ],
    );

    # Registration order is preserved exactly.
    for my $rule (@rules) {
        MyGrabber::addSearchRule(@{$rule});
    }

    return;
}
# Rule callback (installed by main as $MyGrabber::rulefunc): store the text
# captured for one search rule.
#
# Arguments:
#   $resstr   - the text captured for the rule
#   $rulename - the name the rule was registered under
#
# 'Author' text is split into individual names, each appended to
# @MyCrawler::authors as a [name, institute] pair (institute is always '').
# 'Reference' text is passed through MyGrabber::cropOut and appended to
# @MyCrawler::cits.  Any other rule's text is stored verbatim in
# %MyCrawler::props under the rule name.  Every call also sets the
# 'Conference' property.
sub sigmod_rulefunc {
    my ($resstr, $rulename) = @_;

    if ($rulename eq 'Author') {
        # The captured text looks like a BibTeX author field, where names are
        # separated by the word "and", usually followed by a line break and
        # indentation.  Splitting on the surrounding whitespace (instead of
        # the literal " and\n") keeps that indentation out of the names and
        # also handles CRLF line ends and same-line separators.
        for my $piece (split /\s+and\s+/, $resstr) {
            my $authorname = $piece;
            $authorname =~ s/\A\s+//;
            $authorname =~ s/\s+\z//;
            next if $authorname eq '';    # never record a blank author

            my $institute   = '';         # not available from this source
            my @authortuple = ($authorname, $institute);
            push @MyCrawler::authors, \@authortuple;
        }
    }
    elsif ($rulename eq 'Reference') {
        # NOTE(review): cropOut presumably returns the part of $resstr between
        # the two markers -- confirm against MyGrabber.
        my $strout = MyGrabber::cropOut($resstr, ":\n", '<a href=');
        push @MyCrawler::cits, $strout;
    }
    else {
        $MyCrawler::props{$rulename} = $resstr;
    }

    $MyCrawler::props{'Conference'} = 'International Conference on Very Large Data Bases';
}
# Link filter (installed by main as $MyCrawler::links_filter).
# Appends to @MyCrawler::availlinks every entry of @MyGrabber::links that
# matches $MyCrawler::current_year, contains ".html", and does not contain
# "http://" (i.e. absolute http links are skipped).
sub sigmod_links_filter {
    push @MyCrawler::availlinks, grep {
           /$MyCrawler::current_year/
        && /\.html/
        && !/http\:\/\//
    } @MyGrabber::links;
    return;
}
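# --- Code-viewer page residue (not part of the script) ---
# Keyboard shortcuts: copy code Ctrl + C; search code Ctrl + F; full screen F11;
# toggle theme Ctrl + Shift + D; show shortcuts ?; larger font Ctrl + =;
# smaller font Ctrl + -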