📄 zqb.cyol.com.contentextract_003.pl
字号:
#!/usr/bin/perl
#----------------------------------------------------------------------------------
#SiteUrl:http://zqb.cyol.com
#Editor:tianshunjun
#EditDate:2009-3-27
#IncludeClass:http://news.cyol.com,http://edu.cyol.com
#EditRemark:
#
#CheckMan: Gao Jianhai
#CheckDate: 2009-4-15
#CheckRemark: 
#
#SubmitMan: Gao Jianhai
#SubmitDate: 2009-4-15
#SubmitRemark: 
#
#ModifyMan:
#ModifyDate:
#ModifyRemark:
#----------------------------------------------------------------------------------

# Content extractor for article pages of the site named in SiteUrl above.
# The extraction patterns are specific to that site's page template; only the
# news story is extracted, the review (comment) extraction is disabled.

use strict;
use warnings;

# The original template imported Date::Parse although nothing in this file
# calls it, and the module is not part of core Perl.  The import is kept when
# the module is available (other code compiled into the same package may rely
# on the imported names -- TODO confirm), but a missing module no longer
# prevents this extractor from loading.
BEGIN {
    local $@;
    eval { require Date::Parse; Date::Parse->import(); 1 };
}

# ContentExtract($content)
#
# Extracts the news story from the HTML text of one article page.
#
# Parameters:
#   $content - the page HTML as a string (undef is treated as '').
#
# Returns a flattened hash (list of key/value pairs).  When the story block is
# found, the hash has a single key 'story' whose value is a hash reference
# with any of these keys, each present only if its pattern matched:
#   title - text between <div id="ArtTitle"> and <div id="ArtData">, via Trim
#   Date  - first YYYY-MM-DD string inside the story block, via TimeTrim
#   body  - text between <!--enpcontent--> and <!--/enpcontent-->, via Trim
# When the story block is not found the returned hash is empty.
#
# The template's intended schema is {story|reviews}, where story is a hash of
# {url|title|date|body|author|email|location|...} and reviews is an array of
# such hashes; this site fills in only the three story keys listed above.
#
# NOTE(review): the date is stored under 'Date' (capital D) while the schema
# comment and the disabled code below use 'date'.  The key is left unchanged
# because callers may depend on it -- confirm against the consumer.
sub ContentExtract {
    my $content = shift;
    $content = '' unless defined $content;

    my %results;

    ## extract news story
    # The story block runs from the title container to the end-of-content
    # marker.  It is taken as a capture group rather than via $&, and without
    # /g, which has no purpose for a single match in a condition.
    if ($content =~ m{(<div id="ArtTitle">.*?<!--/enpcontent-->)}is) {
        my $story_block = $1;

        # title
        if ($story_block =~ m{<div id="ArtTitle">(.*?)<div id="ArtData">}s) {
            $results{story}{title} = Trim($1);
        }

        # date: the first date-shaped string anywhere in the story block
        if ($story_block =~ /(\d{4}-\d{2}-\d{2})/) {
            $results{story}{Date} = TimeTrim($1);
        }

        # story body
        if ($story_block =~ m{<!--enpcontent-->(.*?)<!--/enpcontent-->}is) {
            $results{story}{body} = Trim($1);
        }

        # --- disabled template code, kept for reference -------------------
        # # author
        # if ($story_block =~ /<\/p><p class=mtext><\/td><\/tr>(.*?)<\/td><\/tr><TR><td width=590 bgcolor=\#f2f2f2>/s)
        # {
        #     $results{story}{author} = SubTrim($1);
        # }
        #
        # # Time =~ /(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})
        #
        # # date
        # if ($story_block =~ /<center><font color=red>(.*?)<\/font><\/center>/s)
        # {
        #     $results{story}{date} = Trim($1);
        # }
        #
        # # Editor
        # if ($story_block =~ /<div class="cupage">(.*?)class="f14">/sg)
        # {
        #     $results{story}{editor} = Trim($1);
        # }
    }

    ## extract news reviews (disabled template code, kept for reference)
    # while ($content =~ /<table width=\"100%\" border=\"0\" cellpadding=\"0\" cellspacing=\"1\" bgcolor=\"\#c6c6c6\">(.*?)<\/table><\/td>/sg)
    # {
    #     my $review_block = $1;
    #     my %review;
    #     # username
    #     if ($review_block =~ /<font color=red>(.*?)<\/font><\/a>/s) {
    #         $review{username} = SubTrim($1);
    #     }
    #     # date
    #     if ($review_block =~ /<img src=images\/lyb04.gif>(.*?)<a href/s) {
    #         $review{date} = SubTrim($1);
    #     }
    #     # IP
    #     if ($review_block =~ /<font color=red>(.*?)<\/font><\/a>/s) {
    #         $review{username} = Trim($1);
    #     }
    #     # review body
    #     if ($review_block =~ /<td class=\"style8\" height=5>(.*?)<tr bgcolor=\"#F5F5F5\">/sg) {
    #         $review{body} = SubTrim($1);
    #     }
    #     # location
    #     if ($review_block =~ /<span title\=\"Encoded location\">(.*?)<\/span>/s) {
    #         $review{location} = Trim($1);
    #     }
    #     push @{$results{reviews}}, \%review;
    # }

    ## the caller converts %results into xml
    return %results;
}

# TimeTrim($text)
#
# Returns $text with leading and trailing whitespace removed ('' for undef).
# Works on a lexical copy so the caller's $_ is left untouched.
sub TimeTrim {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}

# Trim($text)
#
# Reduces an HTML fragment to bare text ('' for undef):
#   1. drops <script>...</script> elements including their content,
#   2. deletes ALL whitespace and the entities &gt; &nbsp; &lt; &quot;,
#   3. deletes any remaining named entity (&name; with 2-8 letters),
#   4. strips the remaining tags.
# Because every whitespace character is deleted, words are joined together.
#
# The entity names in step 2 are restored: in the copy of this file that was
# reviewed they had been decoded to the literal characters > < " and a space,
# which deleted the angle brackets before step 4 and left tag names in the
# text.  NOTE(review): restoration is inferred from that damage -- confirm
# against the version-controlled original.
#
# Works on a lexical copy so the caller's $_ is left untouched.
sub Trim {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/<script.+?<\/script>//sgi;

    # Entities must go before tags are stripped only in the sense that the
    # angle brackets themselves have to survive until the tag patterns run.
    $text =~ s/&gt;|&nbsp;|&lt;|&quot;|\s//sgi;
    $text =~ s/&[A-Za-z]{2,8};//sg;

    # First pass removes well-formed tags; second pass catches leftovers such
    # as a '<' that is directly followed by another '<'.
    $text =~ s/<[^<]+>//sg;
    $text =~ s/<(?:.|\n)+?>//sg;

    return $text;
}

# SubTrim($text)
#
# Like Trim but without script removal or generic entity removal: trims the
# ends, strips tags, then deletes all whitespace and &nbsp; ('' for undef).
# Only the disabled template code in this file calls it.
#
# The &nbsp; alternative is restored for the same reason as in Trim; it is a
# superset of what the damaged pattern removed.
#
# Works on a lexical copy so the caller's $_ is left untouched.
sub SubTrim {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    # Replace the single-line html tags
    $text =~ s/<[^<]+>//sg;
    # Replace the multi-line html tags
    $text =~ s/<(?:.|\n)+?>//sg;

    # Remove whitespace and non-breaking-space entities
    $text =~ s/\s|&nbsp;//sg;

    return $text;
}

1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -