# File: start_sigmod.pl
# (Code-viewer page residue: "Font size:" control label.)
#!/usr/bin/perl -w
use strict;
#########################
# Start SIGMOD
# Yeni, 2006/11
# yeni@yueds.com
#########################
# define the crawler's rule to fetch SIGMOD papers.
use MyCrawler;
# Go to the main entry, then terminate explicitly.
# main() is called with parentheses rather than as "&main;" because the
# ampersand form without parentheses makes the callee share the caller's @_.
main();
exit;
############### PERSONALIZE PART BEGIN ###############
# Main entry.
# Registers the SIGMOD search rules, installs this script's callbacks and
# base URL into the MyGrabber / MyCrawler package variables, and then hands
# the table-of-contents URL of each SIGMOD year (1994-2006) to
# MyCrawler::toc, oldest first.  Takes no arguments.
sub main {
    # use sigmod rules
    sigmod_rules();
    $MyGrabber::rulefunc     = \&sigmod_rulefunc;
    $MyCrawler::links_filter = \&sigmod_links_filter;
    $MyCrawler::pagebase     = 'http://portal.acm.org/';

    # Pieces shared by the TOC URLs.
    # NOTE(review): CFID/CFTOKEN look like session tokens captured when the
    # URLs were copied -- confirm the portal still accepts them.
    my $toc_prefix = 'http://portal.acm.org/toc.cfm?id=';
    my $cf_params  = '&CFID=6986287&CFTOKEN=31392450';
    my $series_query
        = '&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473'
        . '&part=Proceedings&WantType=Proceedings'
        . '&title=International%20Conference%20on%20Management%20of%20Data';

    # The 1994 URL uses a shorter query string than the later years.
    my $query_1994 = '&type=proceeding&coll=Portal&dl=ACM';

    # [ year, proceeding id, query string that follows the id ]
    my @proceedings = (
        [ '1994', '191839',  $query_1994 ],
        [ '1995', '223784',  $series_query ],
        [ '1996', '233269',  $series_query ],
        [ '1997', '253260',  $series_query ],
        [ '1998', '276304',  $series_query ],
        [ '1999', '304182',  $series_query ],
        [ '2000', '342009',  $series_query ],
        [ '2001', '375663',  $series_query ],
        [ '2002', '564691',  $series_query ],
        [ '2003', '872757',  $series_query ],
        [ '2004', '1007568', $series_query ],
        [ '2005', '1066157', $series_query ],
        [ '2006', '1142473', $series_query ],
    );

    # begin collecting from TOCs
    for my $entry (@proceedings) {
        my ($year, $id, $query) = @{$entry};
        MyCrawler::toc('SIGMOD', $year, $toc_prefix . $id . $query . $cf_params);
    }
    return;
}
# Register the page search rules used on SIGMOD content pages.
#
# Starts a new rule set with MyGrabber::newSearchRule() and then adds, in
# this order, the rules Title, Year, Author, Conference, Abstract,
# (LeadToReferences) and Reference.  Takes no arguments; the return value
# is not meaningful.
sub sigmod_rules {
    # Each entry holds the first four addSearchRule arguments:
    #   rule name,
    #   two marker strings (presumably the text that opens and closes the
    #   captured span -- confirm against MyGrabber),
    #   occurrence limit: 1 = appear once, 0 = unlimited.
    my @rule_specs = (
        [ 'Title', '<td class="medium-text" colspan="3">', '</td>', 1 ],
        [ 'Year', 'Proceedings of the ', ' ACM', 1 ],
        [ 'Author', '<a href="results.cfm?query=author', '</tr>', 0 ],
        [ 'Conference', '<SPAN class="mediumb-text">', '</span>', 1 ],
        [ 'Abstract', '<p class="abstract">', '</p>', 1 ],
        [
            '(LeadToReferences)',
            'Note: OCR errors may be found in this Reference List extracted from the full text article. ACM has opted to expose the complete List rather than only correct and linked references.',
            '</font></p>',
            1,
        ],
        [ 'Reference', " \r\n \r\n", '</p>', 0 ],
    );

    # add page search rules for content pages
    # Explicit parentheses make this a real call even when the sub is not
    # predeclared at compile time; a bare "MyGrabber::newSearchRule;" is a
    # strict-subs error (or a silent no-op without strict) in that case.
    MyGrabber::newSearchRule();

    for my $spec (@rule_specs) {
        # The trailing 1 is the "continued with last search" flag, which
        # every rule here sets.
        MyGrabber::addSearchRule(@{$spec}, 1);
    }
    return;
}
# Rule callback installed as $MyGrabber::rulefunc by main.
#
# Arguments:
#   $resstr   - the text matched for a rule
#   $rulename - the name of the rule that produced it
#
# 'Reference' matches are appended to @MyCrawler::cits, 'Author' matches are
# split into a [name, institute] pair appended to @MyCrawler::authors, and
# every other rule's text is stored in %MyCrawler::props under the rule name.
sub sigmod_rulefunc {
    my ($resstr, $rulename) = @_;

    # Citations are collected verbatim.
    return push(@MyCrawler::cits, $resstr)
        if $rulename eq 'Reference';

    # Anything that is not an author block is a plain named property.
    return $MyCrawler::props{$rulename} = $resstr
        if $rulename ne 'Author';

    # Author block: crop the name out of the link text and the institute
    # out of the <small> element, then save them as one tuple.
    my $name      = MyGrabber::cropOut($resstr, '" target="_self">', '</a>');
    my $affiliate = MyGrabber::cropOut($resstr, '<small> ', '</small>');
    return push(@MyCrawler::authors, [ $name, $affiliate ]);
}
# Link filter installed as $MyCrawler::links_filter by main.
#
# Appends to @MyCrawler::availlinks every URL in @MyGrabber::links that
# contains "type=series" and does not contain "SERIES".  Takes no arguments
# and leaves @MyGrabber::links untouched.
sub sigmod_links_filter {
    my @wanted = grep { $_ =~ /type\=series/ && $_ !~ /SERIES/ } @MyGrabber::links;
    push(@MyCrawler::availlinks, @wanted);
    return;
}
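# --- Code-viewer page residue (keyboard shortcut help; not part of the script) ---
# Copy code: Ctrl + C
# Search code: Ctrl + F
# Full-screen mode: F11
# Toggle theme: Ctrl + Shift + D
# Show shortcuts: ?
# Increase font size: Ctrl + =
# Decrease font size: Ctrl + -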