⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 start_icde.pl

📁 利用lwp::get写的
💻 PL
字号:
#!/usr/bin/perl -w
use strict;

#########################
# Start ICDE
# Yeni, 2006/11
# yeni@yueds.com
#########################
# Defines the crawler rules used to fetch ICDE papers.

use MyCrawler;

# Run the crawl, then terminate. main() is called with an explicit empty
# argument list: the legacy "&main;" form silently forwards the current @_.
main();
exit;

############### PERSONALIZE PART BEGIN ###############

# Main entry
# Main entry: registers the ICDE search rules, installs the callbacks
# defined in this file, and then walks the table of contents (TOC) page of
# every ICDE proceedings volume from 1995 through 2006.
sub main {
    # use icde rules
    icde_rules();

    # Hooks picked up by the crawler modules (presumably invoked once per
    # rule match and once per fetched link list -- see MyGrabber/MyCrawler).
    $MyGrabber::rulefunc = \&icde_rulefunc;
    $MyCrawler::links_filter = \&icde_links_filter;

    $MyCrawler::pagebase = 'http://csdl2.computer.org/persagen/';

    # Common prefix of every TOC URL; the per-volume part is appended below.
    my $toc_prefix = 'http://csdl2.computer.org/persagen/DLAbsToc.jsp'
                   . '?resourcePath=/dl/proceedings/'
                   . '&toc=comp/proceedings/icde/';

    # [ year, volume directory ] pairs, crawled in chronological order.
    # Keeping them in one table avoids copy-paste slips between years:
    # the 2004 entry used to repeat the 2003 path (2003/2071), so the 2003
    # volume was fetched twice and labelled 2004.
    # NOTE(review): 2065 is taken from the ICDE 2004 proceedings ISBN
    # (0-7695-2065-0) -- verify against the digital library.
    my @volumes = (
        [ '1995', '6910' ],
        [ '1996', '7240' ],
        [ '1997', '7807' ],
        [ '1998', '8289' ],
        [ '1999', '0071' ],
        [ '2000', '0506' ],
        [ '2001', '1001' ],
        [ '2002', '1531' ],
        [ '2003', '2071' ],
        [ '2004', '2065' ],
        [ '2005', '2285' ],
        [ '2006', '2570' ],
    );

    # begin collecting from TOCs
    foreach my $volume (@volumes) {
        my ($year, $id) = @$volume;
        my $toc_url = $toc_prefix . "$year/$id/00/${id}toc.xml";
        MyCrawler::toc('ICDE', $year, $toc_url);
    }
}

# Registers the search rules applied to each paper (content) page.
#
# Every MyGrabber::addSearchRule call passes, in order:
#   rule name, start marker, end marker,
#   occurrence flag (1 = appears once, 0 = unlimited),
#   continuation flag (1 = continue from the previous search).
# The rule name is what icde_rulefunc later receives as its second argument.
sub icde_rules {
    # Explicit parentheses make this a real call no matter whether the sub
    # has been declared at compile time; a bare "MyGrabber::newSearchRule;"
    # is only a call when it has.
    MyGrabber::newSearchRule();

    # Paper title, taken from the Dublin Core <meta> tag.
    MyGrabber::addSearchRule('Title',   # rule name
                             '<meta name="DC.title" content="',
                             '"/>',
                             1,         # appear once
                             1,         # continued with last search
                            );

    # Paper abstract, taken from the Dublin Core description <meta> tag.
    MyGrabber::addSearchRule('Abstract',  # rule name
                             '<meta name="DC.description" content="',
                             '"/>',
                             1,         # appear once
                             1,         # continued with last search
                            );

    # One match per author: from the author-search link up to the next <br/>.
    MyGrabber::addSearchRule('Author',  # rule name
                             '<a href="http://search2.computer.org/advanced/Author_Result.jsp?qtype=3&amp;select=50&amp;qOpt1=DC_CREATOR&amp;sortOrder=d&amp;queryName=',
                             '<br/>',
                             0,         # unlimited
                             1,         # continued with last search
                            );
}

# Callback installed as $MyGrabber::rulefunc.
#
# Arguments:
#   $resstr   - the text matched by a search rule
#   $rulename - the name of the rule that produced it
#
# 'Author' matches are split into a [name, institute] pair appended to
# @MyCrawler::authors; any other match is stored in %MyCrawler::props under
# the rule name. The fixed 'Conference' and 'Year' properties are refreshed
# on every call, after the match has been stored.
sub icde_rulefunc {
    my ($resstr, $rulename) = @_;

    if ($rulename ne 'Author') {
        # Plain property: keep the matched text as-is.
        $MyCrawler::props{$rulename} = $resstr;
    }
    else {
        # The name is the link text; the institute is whatever follows the
        # first ', ' (empty end marker handed to cropOut).
        my $name        = MyGrabber::cropOut($resstr, '">', '</a>');
        my $affiliation = MyGrabber::cropOut($resstr, ', ', '');

        # Save one [name, institute] tuple per author.
        push @MyCrawler::authors, [ $name, $affiliation ];
    }

    $MyCrawler::props{'Conference'} = 'International Conference on Data Engineering';
    $MyCrawler::props{'Year'} = $MyCrawler::current_year;
}

# Callback installed as $MyCrawler::links_filter.
#
# Scans @MyGrabber::links for URLs that contain both "DLAbsToc.jsp" and
# "DOI=" and appends every second such URL (the 2nd, 4th, ... match) to
# @MyCrawler::availlinks. URLs that do not match are ignored and do not
# affect the alternation.
sub icde_links_filter {
    my $matches_seen = 0;

    for my $link (@MyGrabber::links) {
        next unless $link =~ /DLAbsToc\.jsp/ && $link =~ /DOI\=/;

        # An odd number of earlier matches means this is an even-numbered one.
        push @MyCrawler::availlinks, $link if $matches_seen % 2;
        $matches_seen++;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -