📄 baidu_tieba.pl

📁 抓取百度美女吧的图片, 存成本地. 并且支持断点继续下载
💻 PL
字号:
use strict;

use LWP::UserAgent;
# Shared HTTP client used for every request in this script. Passing an empty
# hash as cookie_jar makes LWP create an in-memory cookie jar, so cookies set
# by the site are sent back on subsequent requests.
my $ua = LWP::UserAgent->new( cookie_jar => {} );
use Storable qw/thaw freeze/;
#crawl($ARGV[0]);
# Optional first command-line argument: when given (and true), only the
# category with exactly this name is processed.
my ($want_cat) = @ARGV;

# Category name => URL of the first listing page of that category.
# The three URLs are identical except for the rs4/rs3 query parameters, so the
# common head and tail are spelled out once. The "word" parameter is
# presumably the GBK-encoded forum name -- TODO confirm.
my $rh_cat = do {
    my $head = 'http://tieba.baidu.com/f?ct=318767104&tn=baiduKeywordSearch&sc=24&pn=0&rn=50&lm=4';
    my $tail = '&word=%C3%C0%C5%AE';
    +{
        '超级经典' => $head . '&rs4=1&rs3=6' . $tail,
        '优秀精品' => $head . '&rs4=2&rs3=3' . $tail,
        '普通精品' => $head . '&rs4=3&rs3=4' . $tail,
    };
};




# ---------------------------------------------------------------------------
# Main driver.
#
# For every configured category (or only the one named on the command line):
#   1. make sure the category directory exists,
#   2. load the category's history file "<category>.dat" (topic => time it
#      was processed),
#   3. collect topic links from the first listing page and from every
#      numbered follow-up page,
#   4. call crawl() for each topic that is not in the history yet and persist
#      the history after every topic, so an interrupted run can resume.
#
# Problems (HTTP errors, unwritable files) are reported with warn and the
# affected item is skipped; the run itself continues.
# ---------------------------------------------------------------------------

$| = 1;    # unbuffered STDOUT so progress lines show up immediately

if ( $want_cat && !exists $rh_cat->{$want_cat} ) {
    warn "Unknown category \"$want_cat\"; nothing will be downloaded\n";
}

for my $cat ( keys %$rh_cat ) {
    next if $want_cat && $want_cat ne $cat;

    # mkdir fails when the directory is already there, so test for it first
    # and only treat a genuinely missing directory as an error.
    if ( !-d $cat && !mkdir $cat ) {
        warn "Cannot create directory \"$cat\": $!\n";
        next;
    }

    my $history_file = $cat . '.dat';
    my $rh_his       = _load_history($history_file);

    # Topic links on the first listing page.
    my $page = _fetch_listing( $rh_cat->{$cat} );
    next unless defined $page;
    my @topics = _topic_links($page);

    # Pagination links have the form <a href=/f?...>[N]</a>, N being the
    # page number shown to the user.
    my %link_for_page;
    while ( $page =~ m{<a\s+href="?(/f\?[^"]*?)"?>\[(\d+)\]}gis ) {
        $link_for_page{$2} = $1;
    }

    # Visit the follow-up pages in ascending order (page 1 is already done).
    for my $page_no ( sort { $a <=> $b } keys %link_for_page ) {
        next if $page_no <= 1;

        # The href is taken from HTML, where "&" is written as "&amp;";
        # undo that escaping before using it as a URL.
        ( my $link = $link_for_page{$page_no} ) =~ s{&amp;}{&}g;

        my $more = _fetch_listing( 'http://tieba.baidu.com' . $link );
        next unless defined $more;
        push @topics, _topic_links($more);
    }

    # A topic may be linked more than once; handle each only once, keeping
    # the order in which the links were found.
    my %seen;
    for my $topic ( grep { !$seen{$_}++ } @topics ) {
        next if $rh_his->{$topic};
        crawl( 'http://tieba.baidu.com' . $topic, $cat );
        $rh_his->{$topic} = time;

        # Persist after every topic so progress survives an interruption.
        _save_history( $history_file, $rh_his );
    }
}

# Fetches one listing page. Returns the raw response body, or undef (after a
# warning) when the request did not succeed.
sub _fetch_listing {
    my ($url) = @_;
    my $response = $ua->get($url);
    if ( !$response->is_success ) {
        warn "GET $url failed: " . $response->status_line . "\n";
        return;
    }
    return $response->content;
}

# Returns every topic link of the form /f?kz=<digits> found in $html, in
# document order (duplicates included).
sub _topic_links {
    my ($html) = @_;
    my @links = $html =~ m{<a\s+href="?(/f\?kz=\d+)"?}gis;
    return @links;
}

# Reads the frozen history hash from $file. Returns an empty hash reference
# when the file cannot be opened (e.g. first run) or does not contain a
# frozen hash.
sub _load_history {
    my ($file) = @_;
    open my $fh, '<', $file or return +{};
    binmode $fh;
    my $frozen = do { local $/; <$fh> };
    close $fh;

    # thaw may die or return undef on damaged input; neither should abort
    # the run, the history is simply started afresh.
    my $history = eval { thaw($frozen) };
    if ( ref $history ne 'HASH' ) {
        warn "Ignoring unreadable history file \"$file\"\n";
        return +{};
    }
    return $history;
}

# Writes the frozen history hash to $file. The data goes to "<file>.tmp"
# first and is renamed into place afterwards, so an interruption while
# writing cannot leave a truncated history file behind. Returns true on
# success; on failure warns and returns nothing.
sub _save_history {
    my ( $file, $history ) = @_;
    my $tmp = $file . '.tmp';

    my $fh;
    if ( !open $fh, '>', $tmp ) {
        warn "Cannot write \"$tmp\": $!\n";
        return;
    }
    binmode $fh;
    my $ok = print {$fh} freeze($history);
    $ok = close($fh) && $ok;    # buffered write errors surface at close
    if ( !$ok ) {
        warn "Writing \"$tmp\" failed: $!\n";
        unlink $tmp;
        return;
    }
    if ( !rename $tmp, $file ) {
        warn "Cannot replace \"$file\": $!\n";
        unlink $tmp;
        return;
    }
    return 1;
}



# crawl($url, $cat)
#
# Downloads every JPEG hosted on hiphotos.baidu.com that is referenced by one
# topic (its first page plus all numbered follow-up pages) into the directory
# "$cat/<topic title>".
#
#   $url - absolute URL of the topic's first page
#   $cat - category name; parent directory of the topic directory, also
#          shown in the progress line
#
# Prints "[$cat]<title>" to STDOUT once the first page has been fetched.
# HTTP errors and file-system errors are reported with warn and the affected
# page or image is skipped; the sub does not die on them. The return value
# carries no meaning.
sub crawl {
    my $url = shift;
    my $cat = shift;

    my $page = _crawl_fetch($url);
    return unless defined $page;

    # The pattern is matched against the raw response bytes, so it can only
    # match when this source file is stored in the same encoding as the page
    # -- NOTE(review): confirm the file encoding.
    my $title = '';
    if ( $page =~ m{<title>百度_(.*?)\s*</title>}i ) {
        $title = $1;
    }

    $title =~ s{&nbsp;}{ }g;
    $title =~ s{&lt;}{<}g;
    $title =~ s{&gt;}{>}g;

    print "[$cat]$title\n";

    # The title comes from the remote page and is used as a directory name:
    # neutralise path separators and control bytes, and never use an empty
    # name, "." or "..". Only bytes below 0x40 are replaced, which keeps
    # multi-byte characters in the title intact.
    ( my $dir_name = $title ) =~ s{[/\x00-\x1f]}{_}g;
    if ( $dir_name =~ m{\A\.{0,2}\z} ) {
        $dir_name = $url =~ m{kz=(\d+)} ? "topic_$1" : 'untitled';
    }

    # Pagination links look like
    #   <a href="/f?z=447213590&amp;ct=335544320&amp;...&amp;pn=100">[3]</a>
    # where the bracketed number is the page number.
    my %link_for_page;
    while ( $page =~ m{<a\s+href="?(/f\?[^"]*?)"?>\[(\d+)\]}gis ) {
        $link_for_page{$2} = $1;
    }

    my @images = _crawl_image_urls($page);

    # Follow-up pages in ascending order; page 1 is the page fetched above.
    for my $page_no ( sort { $a <=> $b } keys %link_for_page ) {
        next if $page_no <= 1;

        # hrefs are HTML-escaped ("&amp;"); turn them back into plain URLs.
        ( my $link = $link_for_page{$page_no} ) =~ s{&amp;}{&}g;

        my $more = _crawl_fetch( 'http://tieba.baidu.com' . $link );
        next unless defined $more;
        push @images, _crawl_image_urls($more);
    }

    # The same picture may be referenced several times; fetch it only once.
    my %seen;
    @images = grep { !$seen{$_}++ } @images;
    return unless @images;

    my $dir = $cat . '/' . $dir_name;
    if ( !-d $dir && !mkdir $dir ) {
        warn "Cannot create directory \"$dir\": $!\n";
        return;
    }

    for my $image_url (@images) {

        # Image URLs are collected case-insensitively, so the file name must
        # be extracted case-insensitively as well (".JPG").
        my ($file_name) = $image_url =~ m{([^/]*\.jpg)\z}i;
        if ( !defined $file_name ) {
            warn "Cannot derive a file name from $image_url\n";
            next;
        }

        # Do not store the body of an error response as a picture.
        my $response = $ua->get($image_url);
        if ( !$response->is_success ) {
            warn "GET $image_url failed: " . $response->status_line . "\n";
            next;
        }

        my $path = "$dir/$file_name";
        my $fh;
        if ( !open $fh, '>', $path ) {
            warn "Cannot write \"$path\": $!\n";
            next;
        }
        binmode $fh;
        print {$fh} $response->content;
        close $fh or warn "Closing \"$path\" failed: $!\n";
    }

    return;
}

# Fetches one topic page. Returns the raw response body, or undef (after a
# warning) when the request did not succeed.
sub _crawl_fetch {
    my ($url) = @_;
    my $response = $ua->get($url);
    if ( !$response->is_success ) {
        warn "GET $url failed: " . $response->status_line . "\n";
        return;
    }
    return $response->content;
}

# Returns, in document order, every src="..." URL in $html that points to a
# .jpg file on hiphotos.baidu.com.
sub _crawl_image_urls {
    my ($html) = @_;
    my @urls = $html =~ m{src="(http://hiphotos\.baidu\.com/[^"]*?\.jpg)"}isg;
    return @urls;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -