📄 news.china.com.contentextract_009.pl
字号:
#!/usr/bin/perl
#----------------------------------------------------------------------------------
#SiteUrl:http://news.china.com/
#Editor:tianshunjun
#EditDate:2009-4-2
#IncludeClass:http://travel.china.com/
#EditRemark:
#
#CheckMan: Gao Jianhai
#CheckDate: 2009-4-15
#CheckRemark:
#
#SubmitMan: Gao Jianhai
#SubmitDate: 2009-4-15
#SubmitRemark:
#
#ModifyMan:
#ModifyDate:
#ModifyRemark:
#----------------------------------------------------------------------------------
# Extract the news story from an article page of this site (SiteUrl in the
# file header: news.china.com).  The patterns are tied to this site's markup;
# this subroutine is different from site to site.
#
# Parameters:
#   $content - HTML source of the article page.
#
# Returns:
#   A hash (as a flat key/value list).  When the story block is found it has
#   one key, 'story', whose value is a hash ref holding whichever of these
#   fields matched: title, Date, body, editor.  When nothing matches the hash
#   is empty.  Review extraction is not implemented for this site, so
#   'reviews' is never populated.
sub ContentExtract {
    my $content = shift;
    my %results;

    # Not used in this sub; kept because `use` runs at compile time and other
    # code may rely on the module being loaded here -- TODO confirm.
    use Date::Parse;

    return %results unless defined $content;

    ## extract news story
    # The story block runs from the title div up to and including the opening
    # tag of the page-number cell.
    if ($content =~ /(<div id=\"chan_newsTitle\">.*?<td class=\"pageNum\">)/is) {
        my $story_block = $1;

        # title
        if ($story_block =~ /<div id=\"chan_newsTitle\">(.*?)<\/div>/s) {
            $results{story}{title} = Trim($1);
        }

        # publication time: first "YYYY-M-D h:m:s" stamp inside the block
        # NOTE(review): the key is 'Date' (capitalised) while the other keys
        # are lower case; left unchanged so existing consumers keep working.
        if ($story_block =~ /(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})/) {
            $results{story}{Date} = TimeTrim($1);
        }

        # story body
        if ($story_block =~ /<div id=\"chan_newsDetail\">(.*?)<style type=text\/css>/is) {
            my $body_html = $1;

            # When an "otherHotNews" box is embedded, keep only what follows
            # its first closing </div>; everything up to there is discarded.
            if ($body_html =~ /<div id=\"otherHotNews\">.*?<\/div>/s) {
                $body_html = substr($body_html, $+[0]);
            }
            $results{story}{body} = Trim($body_html);
        }

        # editor
        if ($story_block =~ /<div align=\"right\">(.*?)<\/div>/s) {
            $results{story}{editor} = Trim($1);
        }
    }

    return %results;
}
# Strip leading and trailing whitespace from a timestamp string.
#
# Parameters:
#   (first argument) - the string to clean; may be undef.
#
# Returns:
#   The trimmed copy, or undef when the argument is undef.
#
# Works on a lexical copy so the caller's $_ (for example the loop variable of
# an enclosing for/map) is never modified.
sub TimeTrim {
    my $text = shift;
    return $text unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}
# Reduce an HTML fragment to bare text with no markup and no whitespace.
#
# Steps, in order:
#   1. remove <script>...</script> blocks together with their contents;
#   2. remove HTML tags;
#   3. remove any leftover '<', '>', '"' characters and all whitespace;
#   4. remove named character entities such as &nbsp; (2-8 letters).
#
# Tags must be stripped BEFORE the stray angle brackets are deleted;
# otherwise the tag patterns can no longer match and tag names leak into the
# text ("<b>x</b>" would become "bx/b").
#
# Because all whitespace is removed, line breaks do not survive and no
# newline escaping is needed afterwards.
#
# Parameters:
#   (first argument) - the HTML fragment; may be undef.
#
# Returns:
#   The cleaned copy, or undef when the argument is undef.
#
# Works on a lexical copy so the caller's $_ is never modified.
sub Trim {
    my $text = shift;
    return $text unless defined $text;

    # 1. script blocks, including their contents
    $text =~ s/<script.+?<\/script>//sgi;

    # 2. tags: first the ones without a nested '<', then anything left
    $text =~ s/<[^<]+>//sg;
    $text =~ s/<(.|\n)+?>//sg;

    # 3. stray brackets, double quotes and every whitespace character
    $text =~ s/>| |<|"|\s//sg;

    # 4. named entities
    $text =~ s/\s|&[A-Za-z]{2,8};//sg;

    return $text;
}
# Strip HTML tags and all whitespace from a fragment.
#
# Unlike Trim, this does not remove <script> contents, named entities or
# stray quote/bracket characters.
#
# Parameters:
#   (first argument) - the HTML fragment; may be undef.
#
# Returns:
#   The cleaned copy, or undef when the argument is undef.
#
# Works on a lexical copy so the caller's $_ is never modified.
sub SubTrim {
    my $text = shift;
    return $text unless defined $text;

    $text =~ s/^\s*//;
    $text =~ s/\s*$//;

    # Remove tags that contain no nested '<'
    $text =~ s/<[^<]+>//sgi;
    # Remove whatever tags are left (may span several lines)
    $text =~ s/<(.|\n)+?>//sgi;

    # Remove every remaining whitespace character
    $text =~ s/\s| //sg;

    return $text;
}
1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -