📄 snapshots.pm
字号:
#!/usr/bin/perl -w## Copyright (C) 2005 Luke Reeves## This program is free software; you can redistribute it and/or modify# it under the terms of the GNU General Public License as published by# the Free Software Foundation; either version 2 of the License, or# (at your option) any later version.## This program is distributed in the hope that it will be useful,# but WITHOUT ANY WARRANTY; without even the implied warranty of# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the# GNU General Public License for more details.## You should have received a copy of the GNU General Public License# along with this program; if not, write to the Free Software# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307# USA#package Insipid::Snapshots;use strict;use vars qw(@ISA @EXPORT);use Insipid::Config;use Insipid::Database;use Insipid::Util;use CGI qw/:standard/;use CGI::Carp qw(fatalsToBrowser);use Date::Format;use Date::Parse;use Date::Parse;use Digest::MD5 qw(md5 md5_hex);use DBI qw/:sql_types/;;use LWP::UserAgent;use HTTP::Request;use MIME::Base64;use XML::Writer;require Exporter;@ISA = qw(Exporter);@EXPORT = qw(show_snapshotsdo_snapshotdelete_snapshotexport_snapshotsshow_snapshotparsepage);my $referer = "";sub export_snapshots { my ($writer) = (@_); my ($sql, $sth, @rs); # Export the objects $writer->startTag("objects"); $sql = "select md5, url, content_type, content_length, date, content from pagecache"; $sth = $dbh->prepare($sql); $sth->execute(); while(@rs = $sth->fetchrow_array()) { $writer->startTag("object", "md5" => $rs[0], "url" => $rs[1], "type" => $rs[2], "length" => $rs[3], "date" => $rs[4] ); $writer->characters(encode_base64($rs[5])); $writer->endTag("object"); } $writer->endTag("objects"); # Export the relationships $writer->startTag("relationships"); $sql = "select md5_parent, md5_child from pagecache_references"; $sth = $dbh->prepare($sql); $sth->execute(); while(@rs = $sth->fetchrow_array()) { $writer->startTag("relationship", "parent" => $rs[0], "child" => $rs[1]); $writer->endTag("relationship"); } $writer->endTag("relationships");}sub fetch_url { my ($url) = (@_); my $md5 = md5_hex("$url"); my $ua = LWP::UserAgent->new(timeout=>30); if(get_option("proxy_host") ne "") { my $proxy_host = get_option("proxy_host"); my $proxy_port = get_option("proxy_port"); $ua->proxy(['http', 'ftp'], "http://$proxy_host:$proxy_port/"); } my $req = HTTP::Request->new(GET => $url); if($referer ne "") { $req->header( referer => $referer ); } my $res = $ua->request($req); if($res->is_success) { my $content = $res->content; # Shove the unparsed page into the cache. my $sql = "insert into pagecache(md5, url, content_type, content_length, content, date)" . " values ( ? , ? , ? , ? , ? , ? )"; my $sth = $dbh->prepare($sql); my $ct = $res->header("Content-Type"); if(length($ct) > 50) { $ct = substr($ct, 0, 50); } $sth->bind_param(1, $md5); $sth->bind_param(2, $url); $sth->bind_param(3, $ct); $sth->bind_param(4, length($content)); # Postgres needs escaping for the binary data. if($dbtype eq "Pg") { $sth->bind_param(5, $content, SQL_BINARY); } else { $sth->bind_param(5, $content); } $sth->bind_param(6, time()); $sth->execute; if($sth->err) { # print $sth->errstr; # print "<br />"; } else { if($ct =~ /text\/html/i) { print "<br />Parsing page... "; parsepage($url, $content, $ct); print "done."; } } } else { my $err = $res->status_line; print "$err<br />"; }}sub show_snapshot { my ($md5) = (@_); my $sql = "select content_type,content,url,date,content_length from pagecache where (md5 = ?)"; my $sth = $dbh->prepare($sql); $sth->execute($md5); my @row = $sth->fetchrow_array; if(!@row) { print "Content-Type: text/plain\r\n\r\n"; print "Can't find cached item \"$md5\""; return; } # Check for IMS request. my $ims = http('If-Modified-Since'); if($ims) { my $t = str2time($ims); if($row[3] <= $t) { # Return a 304 not modified. print "Status: 304 Not Modified\r\n"; return; } } #my $dt = time2str("%a, %d %h %Y %T GMT", $row[3]); my $dt = ims_time($row[3]); print "Last-Modified: $dt\r\n"; print "Content-Type: $row[0]\r\n"; if($row[0] =~ /text\/html/i) { print "\r\n"; my $p = MyParser->new($row[2], undef); if($row[0] =~ /utf/i) { $p->utf8_mode(1); } $p->parse($row[1]); } else { print "Content-Length: $row[4]\r\n"; print "\r\n"; print $row[1]; } exit;}sub show_snapshots { # If a snapshot was asked to be deleted if(param('delete')) { delete_snapshot(param('delete')); } print "<br /><center><table cellpadding=\"5\"><tr><th>Page</th><th>"; print "Date</th><th>Size</th><th>Objects</th><th>Functions</th></tr>"; my $tcount = 0; my $tsize = 0; my $sql = "select pagecache.md5, bookmarks.title, pagecache.date, pagecache.content_length, count(*) from pagecache join bookmarks on (bookmarks.md5 = pagecache.md5) left join pagecache_references on (pagecache.md5 = pagecache_references.md5_parent) group by pagecache.md5, bookmarks.title, pagecache.date, pagecache.content_length order by pagecache.date desc"; my $sth = $dbh->prepare($sql); $sth->execute; while(my @r = $sth->fetchrow_array) { print "<tr>"; print "<td><a href=\"$site_url/snapshot/$r[0]\">$r[1]</td>"; my $timestr = time2str("%Y-%m-%d", $r[2], "EST"); my $count = $r[4] + 1; $tcount += $count; $tsize += $r[3]; print "<td align=\"center\">$timestr</td>"; print "<td align=\"center\">$r[3]</td>"; print "<td align=\"center\">$count</td>"; print "<td><a href=\"$site_url/insipid.cgi?op=snapshots&delete=$r[0]\">delete</a></td>"; print "</tr>"; } print "<tr><td><b>Total</b></td><td> </td>"; print "<td align=\"center\"><b>$tsize</b></td>"; print "<td align=\"center\"><b>$tcount</b></td><td> </td></tr>"; print "</table></center>";}# Deletes a snapshot and all orphan cache children, taking into# account the fact that items can be shared across cached pages.## This is horribly expensive, and someday I'll replace it with# a much nicer function.sub delete_snapshot { my ($md5) = (@_); # The snapshot my $sql = "delete from pagecache where (md5 = ?)"; my $delstatement = $dbh->prepare($sql); $delstatement->execute($md5); # References $sql = "delete from pagecache_references where (md5_parent = ?)"; my $sth = $dbh->prepare($sql); $sth->execute($md5); # Orpans - blow away any md5s in the pagecache table that aren't # referenced as a child in the references table. First, get a list # of valid MD5s. $sql = "select distinct md5_child from pagecache_references"; $sth = $dbh->prepare($sql); $sth->execute(); my $subquery = ""; while(my @r = $sth->fetchrow_array) { if($subquery ne "") { $subquery = $subquery . ","; } $subquery = "$subquery '$r[0]'"; } $sql = "select distinct md5_parent from pagecache_references"; $sth = $dbh->prepare($sql); $sth->execute(); while(my @r = $sth->fetchrow_array) { if($subquery ne "") { $subquery = $subquery . ","; } $subquery = "$subquery '$r[0]'"; } if($subquery eq "") { $sql = "delete from pagecache;"; } else { $sql = "delete from pagecache where md5 not in ($subquery)"; } $sth = $dbh->prepare($sql); $sth->execute();}sub do_snapshot { # Save the page. print "<br /><br />"; my ($bookmark_id) = (@_); my $sql = "select url,md5,title from bookmarks where (id = ?)"; my $sth = $dbh->prepare($sql); $sth->execute($bookmark_id); my @row = $sth->fetchrow_array; if(@row) { print "<p>Fetching \"<b>$row[2]</b>\"...</p>\n"; $referer = $row[0]; fetch_url(@row); } else { die "Couldn't find the row for id $bookmark_id!"; }}sub parsepage { my ($url, $content, $content_type) = (@_); my $p = MyParser->new($url, \&fetch_url); if($content_type =~ /utf/i) { $p->utf8_mode(1); } $p->parse($content);}## "use MyParser;" ## TODO: Make this a separate file.BEGIN { package MyParser; use HTML::Parser; use HTML::Entities (); use URI::URL; use Digest::MD5 qw(md5 md5_hex); use Insipid::Config; use Insipid::Database; use vars qw(@ISA); @ISA = qw(HTML::Parser); sub new { my $pack = shift; my $self = $pack->SUPER::new; @{$self}{qw(__base __grabit)} = @_; $self; } sub declaration { my $self = shift; my ($decl) = @_; } sub start { my $self = shift; my ($tag, $attr, $attrseq, $origtext) = @_; if(!defined($self->{__grabit})) { print("<$tag"); } for (keys %$attr) { my $val = $attr->{$_}; if(($_ eq "/") && ($val = "/")) { next; } if(!defined($self->{__grabit})) { print(" $_=\""); } if( "$tag $_" =~ /^(link href|img src)$/) { $val = url($val)->abs($self->{__base},1); if(!defined($self->{__grabit})) { if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) { my $md5 = md5_hex("$val"); $val = "$site_url/snapshot/$md5"; } } else { # JPG, GIF, PNG and CSS if($val =~ /(\.gif|\.jpg|\.png|\.css)$/i) { join_urls($self->{__base}, $val); $val = $self->{__grabit}($val, $1); } } } if(!defined($self->{__grabit})) { #print(HTML::Entities::encode($val, '<>&"')); print("$val\""); } } if(!defined($self->{__grabit})) { print(">"); } } sub end { my $self = shift; my ($tag) = @_; if(!defined($self->{__grabit})) { print("</$tag>"); } } sub text { my $self = shift; my ($text) = @_; if(!defined($self->{__grabit})) { print("$text"); } } sub comment { my $self = shift; my ($comment) = @_; if(!defined($self->{__grabit})) { print("<!-- $comment -->"); } } sub join_urls { my($parent, $child) = (@_); my $sql = "insert into pagecache_references(md5_parent, md5_child) values(?, ?)"; my $sth = $dbh->prepare($sql); $sth->execute(md5_hex("$parent"), md5_hex("$child")); if($sth->err) { # ignore errors for now } }}## end "use MyParser;" ##1;__END__
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -