⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 start_sigmod.pl

📁 利用lwp::get写的
💻 PL
字号:
#!/usr/bin/perl -w
use strict;

#########################
# Start SIGMOD
# Yeni, 2006/11
# yeni@yueds.com
#########################
# Defines the crawler rules used to fetch SIGMOD papers.

use MyCrawler;

# Run the crawl, then terminate.  main() is called with an explicit, empty
# argument list: the legacy `&main;` form implicitly hands the caller's
# current @_ to the callee, which is never what this script intends.
main();
exit;

############### PERSONALIZE PART BEGIN ###############

# Script entry point: installs the SIGMOD-specific search rules and callbacks,
# then hands one table-of-contents URL per proceedings year to the crawler.
sub main {
    # Register the page search rules before any page is fetched.
    sigmod_rules();

    # Callbacks and base URL read by the MyGrabber / MyCrawler modules.
    $MyGrabber::rulefunc      = \&sigmod_rulefunc;
    $MyCrawler::links_filter  = \&sigmod_links_filter;

    $MyCrawler::pagebase = 'http://portal.acm.org/';

    # One [year, TOC URL] pair per SIGMOD proceedings volume, oldest first.
    # The URLs (including the CFID/CFTOKEN parameters) are kept verbatim.
    # NOTE(review): CFID/CFTOKEN look like session identifiers -- confirm
    # they are still accepted by the portal.
    my @toc_of_year = (
        [ '1994', 'http://portal.acm.org/toc.cfm?id=191839&type=proceeding&coll=Portal&dl=ACM&CFID=6986287&CFTOKEN=31392450' ],
        [ '1995', 'http://portal.acm.org/toc.cfm?id=223784&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '1996', 'http://portal.acm.org/toc.cfm?id=233269&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '1997', 'http://portal.acm.org/toc.cfm?id=253260&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '1998', 'http://portal.acm.org/toc.cfm?id=276304&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '1999', 'http://portal.acm.org/toc.cfm?id=304182&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2000', 'http://portal.acm.org/toc.cfm?id=342009&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2001', 'http://portal.acm.org/toc.cfm?id=375663&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2002', 'http://portal.acm.org/toc.cfm?id=564691&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2003', 'http://portal.acm.org/toc.cfm?id=872757&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2004', 'http://portal.acm.org/toc.cfm?id=1007568&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2005', 'http://portal.acm.org/toc.cfm?id=1066157&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
        [ '2006', 'http://portal.acm.org/toc.cfm?id=1142473&coll=Portal&dl=ACM&type=proceeding&idx=SERIES473&part=Proceedings&WantType=Proceedings&title=International%20Conference%20on%20Management%20of%20Data&CFID=6986287&CFTOKEN=31392450' ],
    );

    # Begin collecting from the TOC pages, in chronological order.
    for my $volume (@toc_of_year) {
        my ($year, $toc_url) = @{$volume};
        MyCrawler::toc('SIGMOD', $year, $toc_url);
    }
}

# Registers the search rules MyGrabber applies to a SIGMOD paper page.
#
# Starts a fresh rule set with MyGrabber::newSearchRule(), then adds seven
# rules in page order.  Each MyGrabber::addSearchRule() call receives:
#   1. the rule name (later passed to the rule callback),
#   2. the text that opens the wanted fragment,
#   3. the text that closes it,
#   4. an occurrence flag (per the original author's notes: 1 = appears
#      once, 0 = unlimited),
#   5. a flag the original author describes as "continued with last search".
#
# Takes no arguments; the return value is whatever the last addSearchRule()
# call returns and is not used by this script.
sub sigmod_rules {
    # Explicit parentheses make this a sub call no matter when MyGrabber is
    # loaded.  The bare `MyGrabber::newSearchRule;` form is only a call if
    # the sub is already known at compile time; otherwise it is a bareword
    # (a strict-subs error, or a silently ignored string without strict).
    MyGrabber::newSearchRule();

    MyGrabber::addSearchRule('Title',   # rule name
                             '<td class="medium-text" colspan="3">',
                             '</td>',
                             1,         # appear once
                             1,         # continued with last search
                            );
    MyGrabber::addSearchRule('Year',    # rule name
                             'Proceedings of the ',
                             ' ACM',
                             1,         # appear once
                             1,         # continued with last search
                            );
    MyGrabber::addSearchRule('Author',  # rule name
                             '<a href="results.cfm?query=author',
                             '</tr>',
                             0,         # unlimited
                             1,         # continued with last search
                            );
    MyGrabber::addSearchRule('Conference',  # rule name
                             '<SPAN class="mediumb-text">',
                             '</span>',
                             1,         # appear once
                             1,         # continued with last search
                            );
    MyGrabber::addSearchRule('Abstract',  # rule name
                             '<p class="abstract">',
                             '</p>',
                             1,         # appear once
                             1,         # continued with last search
                            );
    # Marker rule: matches the notice printed above the reference list.
    # NOTE(review): the parenthesised name suggests a positioning-only rule;
    # confirm how MyGrabber treats it.
    MyGrabber::addSearchRule('(LeadToReferences)',  # rule name
                             'Note: OCR errors may be found in this Reference List extracted from the full text article.  ACM has opted to expose the complete List rather than only correct and linked references.',
                             '</font></p>',
                             1,         # appear once
                             1,         # continued with last search
                            );
    # The opening delimiter below contains literal whitespace copied from the
    # page markup; it must stay byte-for-byte as is.
    MyGrabber::addSearchRule('Reference',  # rule name
                             "&nbsp;&nbsp;\r\n				  \r\n",
                             '</p>',
                             0,         # unlimited
                             1,         # continued with last search
                            );
}

# Rule callback: stores one matched fragment according to the rule that
# produced it.
#
#   $resstr   - the text matched by the rule
#   $rulename - name of the rule, as registered with addSearchRule()
#
# 'Reference' matches are appended to @MyCrawler::cits, 'Author' matches are
# split into a [name, institute] pair appended to @MyCrawler::authors, and
# every other rule stores its text in %MyCrawler::props under the rule name.
sub sigmod_rulefunc {
    my ($resstr, $rulename) = @_;

    if ($rulename eq 'Reference') {
        push @MyCrawler::cits, $resstr;
    }
    elsif ($rulename eq 'Author') {
        # The author's name is the link text; the institute sits in the
        # <small> element of the same fragment.
        my $name = MyGrabber::cropOut($resstr, '" target="_self">', '</a>');
        my $affiliation = MyGrabber::cropOut($resstr, '<small>&nbsp;', '</small>');

        # One fresh two-element array per author.
        push @MyCrawler::authors, [ $name, $affiliation ];
    }
    else {
        $MyCrawler::props{$rulename} = $resstr;
    }
}

# Link filter callback: appends to @MyCrawler::availlinks every URL from
# @MyGrabber::links that contains "type=series" but not "SERIES" (both
# matches are case-sensitive).  Input order is preserved.
sub sigmod_links_filter {
    push @MyCrawler::availlinks,
        grep { $_ =~ /type\=series/ && $_ !~ /SERIES/ } @MyGrabber::links;
    return;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -