⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 news.china.com.contentextract_009.pl

📁 Perl抓取程序Perl抓取程序Perl抓取程序Perl抓取程序
💻 PL
字号:
#!/usr/bin/perl
#----------------------------------------------------------------------------------
#SiteUrl:http://news.china.com/
#Editor:tianshunjun
#EditDate:2009-4-2
#IncludeClass:http://travel.china.com/
#EditRemark:
#
#CheckMan: Gao Jianhai
#CheckDate: 2009-4-15
#CheckRemark: 
#
#SubmitMan: Gao Jianhai
#SubmitDate: 2009-4-15
#SubmitRemark: 
#
#ModifyMan:
#ModifyDate:
#ModifyRemark:
#----------------------------------------------------------------------------------

# Extract the news story from the HTML of a news.china.com article page.
# This subroutine is site-specific: the patterns below match this site's markup.
#
# Parameters:
#   (first argument) - HTML source of the article page.
#
# Returns:
#   %results as a flat key/value list. When the story block is found the list
#   is ( story => { ... } ), where the inner hash may hold the keys
#   title, Date, body and editor; each key is set only if its pattern matched.
#   When the story block is not found the list is empty.
#   Review extraction is disabled (see the commented template near the end),
#   so no "reviews" key is produced.
sub ContentExtract {
    my $content = shift;

    my %results;  # a hash of {story|reviews}:
                  # $results{story} is a hash of {url|title|date|body|author|email|location|...}
                  # $results{reviews} is an array of hashes with the same kind of keys

    # Nothing in this subroutine calls Date::Parse. The import is kept because
    # `use` takes effect at compile time for the whole package, and code outside
    # this subroutine may rely on it -- TODO confirm before removing.
    use Date::Parse;

    ## extract news story
    # The story block runs from the title <div> up to and including the
    # opening tag of the page-number cell.
    if ($content =~ m{(<div id="chan_newsTitle">.*?<td class="pageNum">)}is) {
        my $story_block = $1;

        # title
        if ($story_block =~ m{<div id="chan_newsTitle">(.*?)</div>}s) {
            $results{story}{title} = Trim($1);
        }

        # author: not extracted for this site.

        # time: first "YYYY-M-D H:M:S" timestamp inside the story block.
        # NOTE(review): the key is spelled "Date" while every other key is
        # lower-case; it is kept as-is because consumers of the returned hash
        # are not visible here -- confirm before renaming.
        if ($story_block =~ /(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})/is) {
            $results{story}{Date} = TimeTrim($1);
        }

        # story body
        if ($story_block =~ m{<div id="chan_newsDetail">(.*?)<style type=text/css>}is) {
            my $body_html = $1;

            # If an "otherHotNews" box is present, keep only what follows the
            # first </div> after it. $+[0] is the offset just past the match.
            if ($body_html =~ m{<div id="otherHotNews">.*?</div>}s) {
                $body_html = substr $body_html, $+[0];
            }

            $results{story}{body} = Trim($body_html);
        }

        # editor
        if ($story_block =~ m{<div align="right">(.*?)</div>}s) {
            $results{story}{editor} = Trim($1);
        }
    }

    ## extract news reviews
    # Disabled template, kept for reference only.
#    while ($content =~ /<table width=\"100%\"  border=\"0\" cellpadding=\"0\" cellspacing=\"1\" bgcolor=\"\#c6c6c6\">(.*?)<\/table><\/td>/sg)
#    {
#        my $review_block = $1;
#        my %review;
#        # username
#        if ($review_block =~ /<font color=red>(.*?)<\/font><\/a>/s) {
#            $review{username} = SubTrim($1);
#        }
#        # date
#        if ($review_block =~ /<img src=images\/lyb04.gif>(.*?)<a href/s) {
#            $review{date} = SubTrim($1);
#        }
#        # IP
#        if ($review_block =~ /<font color=red>(.*?)<\/font><\/a>/s) {
#            $review{username} = Trim($1);
#        }
#        # review body
#        if ($review_block =~ /<td class=\"style8\" height=5>(.*?)<tr bgcolor=\"#F5F5F5\">/sg) {
#            $review{body} = SubTrim($1);
#        }
#        # location
#        if ($review_block =~ /<span title\=\"Encoded location\">(.*?)<\/span>/s) {
#            $review{location} = Trim($1);
#        }
#        push @{$results{reviews}}, \%review;
#    }

    ## the caller converts %results into xml
    return %results;
}


# Strip leading and trailing whitespace from a string (used for timestamps).
#
# Parameters:
#   (first argument) - the text to trim; undef is treated as the empty string.
#
# Returns:
#   The trimmed copy. Works on a lexical copy, so the global $_ (and anything
#   the caller has aliased to it in a for/map loop) is left untouched.
sub TimeTrim {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;

    return $text;
}

# Reduce an HTML fragment to bare text with no markup and no whitespace.
#
# Steps, in order:
#   1. remove <script>...</script> blocks together with their contents;
#   2. remove &gt; &nbsp; &lt; &quot; and every whitespace character
#      (including spaces, tabs and line breaks inside the text);
#   3. remove any remaining named entity of 2-8 letters (e.g. &amp;);
#   4. remove the remaining tags.
#
# Parameters:
#   (first argument) - the HTML fragment; undef is treated as the empty string.
#
# Returns:
#   The cleaned copy. Works on a lexical copy, so the global $_ (and anything
#   the caller has aliased to it in a for/map loop) is left untouched.
sub Trim {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/<script.+?<\/script>//sgi;

    # Two passes on purpose: deleting whitespace or an entity in the first
    # pass can join characters into a new entity that the second pass removes.
    $text =~ s/&gt;|&nbsp;|&lt;|&quot;|\s//sgi;
    $text =~ s/\s|&[A-Za-z]{2,8};//sgi;

    # Tags: first the simple ones, then anything left that spans a nested '<'.
    $text =~ s/<[^<]+>//sgi;
    $text =~ s/<(.|\n)+?>//sgi;

    # No separate edge trim or line-break rewrite is needed here: all
    # whitespace has already been deleted above, so such steps cannot match.
    return $text;
}

# Strip HTML tags, then delete every whitespace character and every "&nbsp;"
# from a fragment.
#
# Parameters:
#   (first argument) - the HTML fragment; undef is treated as the empty string.
#
# Returns:
#   The cleaned copy. Works on a lexical copy, so the global $_ (and anything
#   the caller has aliased to it in a for/map loop) is left untouched.
sub SubTrim {
    my $text = shift;
    $text = '' unless defined $text;

    # Remove the single-line html tags
    $text =~ s/<[^<]+>//sgi;
    # Remove the multi-line html tags (anything left that spans a nested '<')
    $text =~ s/<(.|\n)+?>//sgi;
    # Remove all whitespace and non-breaking-space entities; this also takes
    # care of leading and trailing whitespace, so no separate edge trim is needed.
    $text =~ s/\s|&nbsp;//sg;

    return $text;
}


1;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -