# baidu_tieba.pl
use strict;
use LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->cookie_jar({});
use Storable qw/thaw freeze/;
# Driver.
#
# For every search category below (or only the one named by the first
# command-line argument) this collects the topic links from the category's
# listing pages and calls crawl() on each topic that is not yet recorded in
# the category's history file "<category>.dat" (a Storable-frozen hash of
# topic path => time it was crawled). The history is saved after every topic
# so an interrupted run can resume where it stopped.
my $want_cat = $ARGV[0];
my $rh_cat = {
    '超级经典' => 'http://tieba.baidu.com/f?ct=318767104&tn=baiduKeywordSearch&sc=24&pn=0&rn=50&lm=4&rs4=1&rs3=6&word=%C3%C0%C5%AE',
    '优秀精品' => 'http://tieba.baidu.com/f?ct=318767104&tn=baiduKeywordSearch&sc=24&pn=0&rn=50&lm=4&rs4=2&rs3=3&word=%C3%C0%C5%AE',
    '普通精品' => 'http://tieba.baidu.com/f?ct=318767104&tn=baiduKeywordSearch&sc=24&pn=0&rn=50&lm=4&rs4=3&rs3=4&word=%C3%C0%C5%AE',
};

# Unbuffered STDOUT so progress lines show up while the crawl is running.
$| = 1;

# A mistyped category would otherwise make the script do nothing, silently.
if ( $want_cat && !exists $rh_cat->{$want_cat} ) {
    warn "Unknown category '$want_cat'; known categories: "
        . join( ', ', sort keys %$rh_cat ) . "\n";
}

# Fetch a URL; return the response body, or nothing (after a warning) when
# the request did not succeed, so an error page is never parsed as a listing.
my $fetch_listing = sub {
    my ($listing_url) = @_;
    my $res = $ua->get($listing_url);
    return $res->content if $res->is_success;
    warn "GET $listing_url failed: " . $res->status_line . "\n";
    return;
};

# Return every topic link ("/f?kz=<digits>") found in a listing page.
my $topic_links = sub {
    my ($html) = @_;
    return $html =~ m{<a\s+href="?(/f\?kz=\d+)"?}gis;
};

# Write the history hash to $file via a temporary file, so that a run
# interrupted in the middle of a save cannot leave a truncated history behind.
# Returns true on success.
my $save_history = sub {
    my ( $file, $rh ) = @_;
    my $tmp = $file . '.tmp';

    open( my $out, '>', $tmp ) or do {
        warn "Cannot write $tmp: $!\n";
        return;
    };
    binmode $out;
    my $written = print {$out} freeze($rh);
    # Buffered write errors only surface at close, so check both.
    unless ( close($out) && $written ) {
        warn "Cannot write $tmp: $!\n";
        unlink $tmp;
        return;
    }

    unless ( rename $tmp, $file ) {
        # rename's behaviour is platform-dependent (see perlfunc); retry once
        # after removing the old file in case replacing in place is refused.
        unlink $file;
        unless ( rename $tmp, $file ) {
            warn "Cannot replace $file: $!\n";
            unlink $tmp;
            return;
        }
    }
    return 1;
};

CATEGORY:
for my $cat ( keys %$rh_cat ) {
    next CATEGORY if $want_cat && $want_cat ne $cat;

    # crawl() stores images below a directory named after the category.
    # Without it nothing could be saved, yet topics would still be recorded
    # as crawled, so skip the category instead.
    unless ( -d $cat || mkdir $cat ) {
        warn "Cannot create directory '$cat': $!\n";
        next CATEGORY;
    }

    # Load the history of topics crawled by earlier runs.
    my $history_file = $cat . '.dat';
    my $rh_his       = {};
    if ( open( my $in, '<', $history_file ) ) {
        binmode $in;
        my $frozen = do { local $/; <$in> };
        close $in;
        # thaw can fail on an empty or damaged file; fall back to an empty
        # history rather than continuing with a non-hash value.
        my $loaded =
            ( defined $frozen && length $frozen )
            ? eval { thaw($frozen) }
            : undef;
        if ( ref $loaded eq 'HASH' ) {
            $rh_his = $loaded;
        }
        else {
            warn "Ignoring unreadable history file $history_file\n";
        }
    }

    # Collect topic links from the first listing page ...
    my $page = $fetch_listing->( $rh_cat->{$cat} );
    next CATEGORY unless defined $page;
    my @topics = $topic_links->($page);

    # ... and from every further page. Pagination links have the form
    # <a href="/f?...">[N]</a>; page 1 is the page already fetched.
    my %href_of_page;
    while ( $page =~ m{<a\s+href="?(/f\?[^"]*?)"?>\[(\d+)\]}gis ) {
        $href_of_page{$2} = $1;
    }
    for my $p ( sort { $a <=> $b } keys %href_of_page ) {
        next if $p <= 1;
        my $more = $fetch_listing->( 'http://tieba.baidu.com' . $href_of_page{$p} );
        next unless defined $more;
        push @topics, $topic_links->($more);
    }

    # Crawl each topic not seen before. The same link can occur several
    # times in @topics; the history check skips the repeats.
    for my $topic (@topics) {
        next if $rh_his->{$topic};
        # crawl's return value is not inspected: the topic is recorded as
        # done whether or not any image was saved.
        crawl( 'http://tieba.baidu.com' . $topic, $cat );
        $rh_his->{$topic} = time;
        $save_history->( $history_file, $rh_his );
    }
}
# crawl($url, $cat)
#
# Download every hiphotos.baidu.com JPEG referenced by one topic, including
# the topic's numbered follow-up pages, into the directory "$cat/<title>/".
#
#   $url - absolute URL of the topic's first page
#   $cat - category name; parent directory of the topic directory, also
#          shown in the progress line
#
# Uses the file-level LWP::UserAgent object $ua. Prints "[$cat]<title>" to
# STDOUT once the first page has been fetched. Failures (HTTP errors,
# unwritable files) are reported with warn and skipped; nothing is thrown.
#
# Returns the number of image files written, or nothing when the first page
# could not be fetched, no image was found, or the directory could not be
# created.
sub crawl {
    my ( $url, $cat ) = @_;

    # Fetch a URL; return the body, or nothing (after a warning) on failure,
    # so an HTTP error page is never parsed or saved as if it were content.
    my $fetch = sub {
        my ($target) = @_;
        my $res = $ua->get($target);
        return $res->content if $res->is_success;
        warn "GET $target failed: " . $res->status_line . "\n";
        return;
    };

    my $page = $fetch->($url);
    return unless defined $page;

    # Capture through list assignment: when the pattern does not match,
    # $title is undef instead of a stale $1 left over from an earlier match.
    my ($title) = $page =~ m{<title>百度_(.*?)\s*</title>}i;
    if ( defined $title ) {
        # Decode the entities expected in a title. NOTE(review): these three
        # substitutions were no-ops as found (space => space, '<' => '<',
        # '>' => '>'), presumably entity decodes flattened by an HTML
        # rendering of the source; confirm against the original script.
        $title =~ s{&nbsp;}{ }g;
        $title =~ s{&lt;}{<}g;
        $title =~ s{&gt;}{>}g;
    }
    if ( !defined $title || $title eq '' ) {
        # No usable title: name the topic after its id so its images do not
        # land directly in the category directory.
        $title = $url =~ m{kz=(\d+)} ? "topic_$1" : 'untitled';
    }
    print "[$cat]$title\n";

    # The title comes from the remote page, so make it safe as ONE path
    # component: no '/', no control bytes, and never '.' or '..'.
    # Backslash is left alone on purpose: the sample URLs in this script
    # carry what appears to be a GBK-encoded query (word=%C3%C0%C5%AE), and
    # in such an encoding 0x5C may be the second byte of a character.
    # TODO confirm the page encoding.
    my $dir_name = $title;
    $dir_name =~ s{[/\x00-\x1f]}{_}g;
    $dir_name = "_$dir_name" if $dir_name =~ m{\A\.\.?\z};

    my $image_re = qr{src="(http://hiphotos\.baidu\.com/[^"]*?\.jpg)"}is;
    my @images   = $page =~ /$image_re/g;

    # Follow-up pages are linked as <a href="/f?...">[N]</a>, for example
    #   <a href="/f?z=447213590&...&pn=100">[3]</a>
    # Page 1 is the page already fetched.
    my %href_of_page;
    while ( $page =~ m{<a\s+href="?(/f\?[^"]*?)"?>\[(\d+)\]}gis ) {
        $href_of_page{$2} = $1;
    }
    for my $p ( sort { $a <=> $b } keys %href_of_page ) {
        next if $p <= 1;
        my $more = $fetch->( 'http://tieba.baidu.com' . $href_of_page{$p} );
        next unless defined $more;
        push @images, $more =~ /$image_re/g;
    }

    # The same image URL can be referenced more than once; fetch it once.
    my %seen;
    @images = grep { !$seen{$_}++ } @images;
    return unless @images;

    my $dir = "$cat/$dir_name";
    unless ( -d $dir || mkdir $dir ) {
        warn "Cannot create directory '$dir': $!\n";
        return;
    }

    my $saved = 0;
    for my $image_url (@images) {
        # /i matches $image_re, which also accepts ".JPG"; a case-sensitive
        # match here would fail on such URLs and reuse an old capture.
        my ($file_name) = $image_url =~ m{([^/]*\.jpg)\z}i;
        next unless defined $file_name;

        my $data = $fetch->($image_url);
        next unless defined $data;

        my $path = "$dir/$file_name";
        open( my $out, '>', $path ) or do {
            warn "Cannot write $path: $!\n";
            next;
        };
        binmode $out;
        my $written = print {$out} $data;
        # Buffered write errors only surface at close, so check both.
        unless ( close($out) && $written ) {
            warn "Cannot write $path: $!\n";
            next;
        }
        $saved++;
    }
    return $saved;
}
# (The code viewer's keyboard-shortcut help that followed the listing was page chrome, not part of the script.)