📄 snapshots.pm

📁 Insipid 是一款基于Web书签仓库。很方面的记录下各种输入输出信息。
💻 PM
字号:
#!/usr/bin/perl -w## Copyright (C) 2005 Luke Reeves## This program is free software; you can redistribute it and/or modify# it under the terms of the GNU General Public License as published by# the Free Software Foundation; either version 2 of the License, or# (at your option) any later version.## This program is distributed in the hope that it will be useful,# but WITHOUT ANY WARRANTY; without even the implied warranty of# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the# GNU General Public License for more details.## You should have received a copy of the GNU General Public License# along with this program; if not, write to the Free Software# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307# USA#package Insipid::Snapshots;use strict;use vars qw(@ISA @EXPORT);use Insipid::Config;use Insipid::Database;use Insipid::Util;use CGI qw/:standard/;use CGI::Carp qw(fatalsToBrowser);use Date::Format;use Date::Parse;use Date::Parse;use Digest::MD5 qw(md5 md5_hex);use DBI qw/:sql_types/;;use LWP::UserAgent;use HTTP::Request;use MIME::Base64;use XML::Writer;require Exporter;@ISA = qw(Exporter);@EXPORT = qw(show_snapshotsdo_snapshotdelete_snapshotexport_snapshotsshow_snapshotparsepage);my $referer = "";sub export_snapshots {	my ($writer) = (@_);	my ($sql, $sth, @rs);	# Export the objects	$writer->startTag("objects");	$sql = "select md5, url, content_type, content_length, date, content		from pagecache";	$sth = $dbh->prepare($sql);	$sth->execute();		while(@rs = $sth->fetchrow_array()) {		$writer->startTag("object",				"md5" => $rs[0],				"url" => $rs[1],				"type" => $rs[2],				"length" => $rs[3],				"date" => $rs[4]				);		$writer->characters(encode_base64($rs[5]));		$writer->endTag("object");	}	$writer->endTag("objects");		# Export the relationships 	$writer->startTag("relationships");	$sql = "select md5_parent, md5_child from pagecache_references";	$sth = $dbh->prepare($sql);	$sth->execute();	while(@rs = $sth->fetchrow_array()) {		$writer->startTag("relationship",				"parent" => $rs[0],				"child" => $rs[1]);		$writer->endTag("relationship");	}		$writer->endTag("relationships");}sub fetch_url {	my ($url) = (@_);	my $md5 = md5_hex("$url");	my $ua = LWP::UserAgent->new(timeout=>30);	if(get_option("proxy_host") ne "") {		my $proxy_host = get_option("proxy_host");		my $proxy_port = get_option("proxy_port");		$ua->proxy(['http', 'ftp'], "http://$proxy_host:$proxy_port/");	}		my $req = HTTP::Request->new(GET => $url);	if($referer ne "") { $req->header( referer => $referer ); }	my $res = $ua->request($req);	if($res->is_success) {		my $content = $res->content;		# Shove the unparsed page into the cache.		my $sql = "insert into pagecache(md5, url, content_type, content_length, content, date)" . 			" values ( ? , ? , ? , ? , ? , ? )";			my $sth = $dbh->prepare($sql);		my $ct = $res->header("Content-Type");		if(length($ct) > 50) { $ct = substr($ct, 0, 50); }		$sth->bind_param(1, $md5);		$sth->bind_param(2, $url);		$sth->bind_param(3, $ct);		$sth->bind_param(4, length($content));		# Postgres needs escaping for the binary data.		if($dbtype eq "Pg") {			$sth->bind_param(5, $content, SQL_BINARY);		} else {			$sth->bind_param(5, $content);		}		$sth->bind_param(6, time());		$sth->execute;		if($sth->err) {			# print $sth->errstr;			# print "<br />";		} else {			if($ct =~ /text\/html/i) {				print "<br />Parsing page... ";				parsepage($url, $content, $ct);				print "done.";			}		}	} else {		my $err = $res->status_line;		print "$err<br />";	}}sub show_snapshot {	my ($md5) = (@_);	my $sql = "select content_type,content,url,date,content_length		from pagecache where (md5 = ?)";			my $sth = $dbh->prepare($sql);	$sth->execute($md5);		my @row = $sth->fetchrow_array;		if(!@row) {		print "Content-Type: text/plain\r\n\r\n";		print "Can't find cached item \"$md5\"";		return;	}	# Check for IMS request.	my $ims = http('If-Modified-Since');	if($ims) { 		my $t = str2time($ims);		if($row[3] <= $t) {			# Return a 304 not modified.			print "Status: 304 Not Modified\r\n";			return;		}	}		#my $dt = time2str("%a, %d %h %Y %T GMT", $row[3]);	my $dt = ims_time($row[3]);	print "Last-Modified: $dt\r\n";	print "Content-Type: $row[0]\r\n";	if($row[0] =~ /text\/html/i) {		print "\r\n";		my $p = MyParser->new($row[2], undef);		if($row[0] =~ /utf/i) {			$p->utf8_mode(1);		}		$p->parse($row[1]);	} else {		print "Content-Length: $row[4]\r\n";		print "\r\n";		print $row[1];	}	exit;}sub show_snapshots {	# If a snapshot was asked to be deleted	if(param('delete')) {		delete_snapshot(param('delete'));	}	print "<br /><center><table cellpadding=\"5\"><tr><th>Page</th><th>";	print "Date</th><th>Size</th><th>Objects</th><th>Functions</th></tr>";	my $tcount = 0;	my $tsize = 0;		my $sql = "select pagecache.md5, bookmarks.title, pagecache.date,			pagecache.content_length, count(*)			from pagecache 			join bookmarks on (bookmarks.md5 = pagecache.md5)			left join pagecache_references on 			   (pagecache.md5 = pagecache_references.md5_parent) 			group by 			   pagecache.md5, bookmarks.title, pagecache.date,			   pagecache.content_length			order by pagecache.date desc";	my $sth = $dbh->prepare($sql);	$sth->execute;	while(my @r = $sth->fetchrow_array) {		print "<tr>";		print "<td><a href=\"$site_url/snapshot/$r[0]\">$r[1]</td>";		my $timestr = time2str("%Y-%m-%d", $r[2], "EST");		my $count = $r[4] + 1; $tcount += $count;		$tsize += $r[3];		print "<td align=\"center\">$timestr</td>";		print "<td align=\"center\">$r[3]</td>";		print "<td align=\"center\">$count</td>";		print "<td><a href=\"$site_url/insipid.cgi?op=snapshots&delete=$r[0]\">delete</a></td>";		print "</tr>";	}	print "<tr><td><b>Total</b></td><td>&nbsp;</td>";	print "<td align=\"center\"><b>$tsize</b></td>";	print "<td align=\"center\"><b>$tcount</b></td><td>&nbsp;</td></tr>";		print "</table></center>";}# Deletes a snapshot and all orphan cache children, taking into# account the fact that items can be shared across cached pages.## This is horribly expensive, and someday I'll replace it with# a much nicer function.sub delete_snapshot {	my ($md5) = (@_);		# The snapshot	my $sql = "delete from pagecache where (md5 = ?)";	my $delstatement = $dbh->prepare($sql);	$delstatement->execute($md5);	# References	$sql = "delete from pagecache_references where (md5_parent = ?)";	my $sth = $dbh->prepare($sql);	$sth->execute($md5);		# Orpans - blow away any md5s in the pagecache table that aren't 	# referenced as a child in the references table. First, get a list	# of valid MD5s.	$sql = "select distinct md5_child from pagecache_references";	$sth = $dbh->prepare($sql);	$sth->execute();	my $subquery = "";	while(my @r = $sth->fetchrow_array) {		if($subquery ne "") { $subquery = $subquery . ","; }		$subquery = "$subquery '$r[0]'";	}		$sql = "select distinct md5_parent from pagecache_references";	$sth = $dbh->prepare($sql);	$sth->execute();	while(my @r = $sth->fetchrow_array) {		if($subquery ne "") { $subquery = $subquery . ","; }		$subquery = "$subquery '$r[0]'";	}	if($subquery eq "") {		$sql = "delete from pagecache;";	} else {		$sql = "delete from pagecache where md5 not in ($subquery)";	}		$sth = $dbh->prepare($sql);	$sth->execute();}sub do_snapshot {	# Save the page.	print "<br /><br />";		my ($bookmark_id) = (@_);	my $sql = "select url,md5,title from bookmarks where (id = ?)";        my $sth = $dbh->prepare($sql);	$sth->execute($bookmark_id);	my @row = $sth->fetchrow_array;	if(@row) {		print "<p>Fetching \"<b>$row[2]</b>\"...</p>\n";		$referer = $row[0];		fetch_url(@row);		} else {		die "Couldn't find the row for id $bookmark_id!";	}}sub parsepage {	my ($url, $content, $content_type) = (@_);	my $p = MyParser->new($url, \&fetch_url);	if($content_type =~ /utf/i) { 		$p->utf8_mode(1);	}	$p->parse($content);}## "use MyParser;" ## TODO: Make this a separate file.BEGIN {	package MyParser;	use HTML::Parser;	use HTML::Entities ();	use URI::URL;	use Digest::MD5 qw(md5 md5_hex);	use Insipid::Config;	use Insipid::Database;	use vars qw(@ISA);	@ISA = qw(HTML::Parser);	sub new {		my $pack = shift;		my $self = $pack->SUPER::new;		@{$self}{qw(__base __grabit)} = @_;		$self;	}	sub declaration {		my $self = shift;		my ($decl) = @_;	}	sub start {		my $self = shift;		my ($tag, $attr, $attrseq, $origtext) = @_;		if(!defined($self->{__grabit})) {			print("<$tag");		}		for (keys %$attr) {			my $val = $attr->{$_};			if(($_ eq "/") && ($val = "/")) { next; }			if(!defined($self->{__grabit})) { print(" $_=\""); }			if( "$tag $_" =~ /^(link href|img src)$/) {				$val = url($val)->abs($self->{__base},1);					if(!defined($self->{__grabit})) {					if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {						my $md5 = md5_hex("$val");						$val = "$site_url/snapshot/$md5";					}				} else {					# JPG, GIF, PNG and CSS					if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) {						join_urls($self->{__base}, $val);						$val = $self->{__grabit}($val, $1);					}				}			}			if(!defined($self->{__grabit})) {				#print(HTML::Entities::encode($val, '<>&"'));				print("$val\"");			}		}		if(!defined($self->{__grabit})) { print(">"); }	}	sub end {		my $self = shift;		my ($tag) = @_;		if(!defined($self->{__grabit})) { print("</$tag>"); }	}	sub text {		my $self = shift;		my ($text) = @_;		if(!defined($self->{__grabit})) { print("$text"); }	}	sub comment {		my $self = shift;		my ($comment) = @_;		if(!defined($self->{__grabit})) { print("<!-- $comment -->"); }	}		sub join_urls {		my($parent, $child) = (@_);		my $sql = "insert into pagecache_references(md5_parent, md5_child) values(?, ?)";		my $sth = $dbh->prepare($sql);		$sth->execute(md5_hex("$parent"), md5_hex("$child"));		if($sth->err) {			# ignore errors for now		}	}}## end "use MyParser;" ##1;__END__
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -