check-~1.pl
#! /usr/bin/perl -w
#
# Copyright 1999/2000 August Hoerandl (august.hoerandl@gmx.at)
# http://elina.htlw16.ac.at/~hoerandl
#
# parts from search.pl which is
# Copyright 1999 Jose M. Vidal
# Jose M. Vidal, vidal@multiagent.com, http://jmvidal.ece.sc.edu
#
# This program is free software. You can redistribute it and/or modify
# it under the terms of the GNU General Public License
#
# create database with meta info
#
# actions
#   read database
#     read to @allRecords
#   read metadata
#     read to $meta{$URL}, $date{$URL}, $error{$URL}
#   check all (expired) urls
#     based on $date{$URL}
#   dump metadata
#   dump database with meta info
#   dump errordata
#
# History
# 0.1
#  4. Sept. 1999 - first version
# 0.2
# 12. Sept. 1999 - little bug with multiple line meta corrected
#                  added option & getoptions
# 0.3
# 21. Sept. 1999 - merged back file detection from Jose
#
# 0.4
# 26. Oct 1999  - added code for: meta Classification
# 31. Oct 1999  - added noproxy, showrecheck, $time, error.html
#
# 0.5
# 12. Nov 1999  - added checkheadline
#                 strip metainfo (multiple blanks, html)
#                 convert iso
#
# 26. Nov. 1999 - added $urlto, $urlmaxsize
# 29. Nov. 1999 - added $agent
# 23. Jan. 2000 - fixed bug which used up all memory
#                 added <meta value="..."> and <meta contents="...">
# 10. Feb. 2000 - use LWP::Parallel::UserAgent
#                 http://theoryx5.uwinnipeg.ca/CPAN/data/ParallelUserAgent/Bundle/ParallelUA.html
#                 install: perl -MCPAN -e 'install Bundle::ParallelUA'
#                 --maxpar  number of hosts to query in parallel
#
# 11. Feb. 2000 - fixed dump_database (replaces $error)
#
# 20. Feb. 2000 - fixed make_url
#                 (thanks to Andreas Hille for beta testing)
#                 urlmaxsize to handle very big pages
#                 urlmaxresponse to handle endless small answers
# 26. Feb. 2000 - added sorting (errorcode) to dump_errordata_html
#
#  4. Mar. 2000 - added $req->previous / $urldata to allow redirection
#
# 22. Mar. 2000 - added sorting to dump_errordata_html (errorcount)

$version = "0.65";

$database = "/usr/local/httpd/cgi-bin/urls.db"; # Name of db file.
$newdb = "";             # result: database
$metadata = "meta.db";   # result: metainfo
$errordata = "error.db"; # result: urls with error
$error_recheck = 10;     # number of days for recheck after error
$age_recheck = 40;       # number of days for recheck
# add some randomness - so we won't fetch all urls on the same day
$random = .5;            # random number of days added (factor)
$dumpfreq = 10;          # when to dump metadata (number of urls read)
$maxurls = 2;            # max urls to fetch recursively (frames)
$maxcheck = 1000;        # max number of urls to check in one run
$sleep = 2;              # time to sleep between url requests - not supported !!!
$debug = 0;              # set to 1 for debug output
$showrecheck = 0;        # set to 1 to see next recheck
$maxpar = 10;            # maximum requests in parallel
$showhash = 1;           # show hashes on progress
$checkerror = 0;         # check all urls with errorcount > 0
$reseterror = 0;         # reset all error counters - only with $checkerror
$checknometa = 0;        # check all urls with meta data = ""
$checkheadline = 0;      # add <h1>, <h2>, ... to meta info - see $whichheadline
$whichheadline = "1-2";  # which headlines to check
$urlto = 0;              # timeout for getting url (seconds)
$urlmaxsize = 0;         # max size of page to get
$urlmaxresponse = 500;   # max number of responses (parts)
# useragent to send in http request ("spider name")
$agent = "check-meta - spider";
$noproxy = 0;            # don't use proxy from environment
# lines shouldn't get too long
$maxlinelength = 4*1024; # max line length during get meta
$test1 = 0;              # simple test
$test2 = 0;              # more tests
$test3 = 0;              # even more tests
$test4 = 0;              # even more tests

# ------------------------------------------------------------------
# no user serviceable parts below
# ------------------------------------------------------------------

use LWP;
use HTTP::Request::Common;
use Getopt::Long;
require LWP::Parallel::UserAgent;
use HTTP::Request;
use LWP::Parallel::UserAgent qw(:CALLBACK);
#use LWP::Debug qw(+);
#require LWP::Debug;

# ------------------------------------------------------------------
$numlines_bk2site = 9;   # number of lines per record - standard
$numlines_metadata = 10; # number of lines per record - with metadata
# ------------------------------------------------------------------
$countmeta = 0;
$counthead = 0;

# ------------------------------------------------------------------
# normalize an url via HTTP::Request
sub norm_url {
    local($url) = @_;
    my $req = HTTP::Request->new('GET', $url);
    return $req->url;
}

# ------------------------------------------------------------------
# read the bk2site urls.db into @allRecords and initialize the
# per-url hashes %error, %date and %meta
sub read_database {
    open(DB, "<$database") || die ("Can't open $database; $!");
    $db_line1 = <DB>;
    chop($db_line1);
    if ($db_line1 eq "#bk2site urls.db generated file") {
        printf "bk2site urls.db generated file\n" if $debug;
        $numlines = $numlines_bk2site;
    } else {
        printf "bk2site urls.db has already metadata\n" if $debug;
        $numlines = $numlines_metadata;
    };
    $db_line2 = <DB>;
    $db_line3 = <DB>;
    $db_line4 = <DB>;
    @allRecords = <DB>; # there are $numlines lines per record,
                        # so multiply the record index by $numlines
    close(DB);
    ## 0 is the type
    ## 1 is the ParentTitle
    ## 2 is the url or relative dir
    ## 3 is the Title
    ## 4 is the comment
    ## 5 is the creation time
    ## 6 is the last modified time
    ## 7 is the last visit time
    ## 8 is the number of user hits
    ## 9 is the meta info
    $typeN = 0;
    # $parentTitleN = 1;
    $urlN = 2;
    # $titleN = 3;
    # $commentN = 4;
    # $creationtimeN = 5;
    # $modifiedtimeN = 6;
    # $visittimeN = 7;
    # $urlhitsN = 8;
    # $numlines = 10; ## 0-9, as above
    # my $numRecords = 15; # test only
    my $numRecords = ($#allRecords + 1)/$numlines;
    my $i;
    for ($i = 0; $i < $numRecords; ++$i) { # initialize meta
        my $cr = ($i * $numlines);
        next if (($allRecords[$cr + $typeN]) eq "FOLDER\n");
        my $url = $allRecords[$cr + $urlN];
        chop($url);
        $url = &norm_url($url);
        print "url $url added\n" if $debug;
        $error{$url} = 0;
        $date{$url} = 0;
        $meta{$url} = "";
    }
}
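# Illustration only (not part of the original script): under the 10-line
# record layout documented in read_database above, a single urls.db entry
# might look like the block below. The concrete values are invented; the
# only property the code relies on is that FOLDER records are skipped.
#
#   LINK                                 <- type (anything but FOLDER)
#   Bookmarks                            <- ParentTitle
#   http://www.example.org/index.html    <- url
#   Example Page                         <- Title
#   a short comment                      <- comment
#   936396000                            <- creation time
#   936396000                            <- last modified time
#   936396000                            <- last visit time
#   0                                    <- number of user hits
#   keywords and description ...         <- meta info (10th line, if present)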
# ------------------------------------------------------------------
# write the database back to $newdb, appending the meta info line
sub dump_database {
    print "\ndumping database $newdb\n";
    open(DB, ">$newdb") || die ("Can't open $newdb; $!");
    # print DB "#bk2site urls.db generated file metadata";
    if ($db_line1 eq "#bk2site urls.db generated file") {
        print DB "#bk2site urls.db generated file metadata\n";
    } else {
        print DB "$db_line1\n";
    }
    print DB "$db_line2";
    print DB "$db_line3";
    print DB "$db_line4";
    my $numRecords = ($#allRecords + 1)/$numlines;
    for ($i = 0; $i < $numRecords; $i++) {
        my $cr = ($i * $numlines);
        my $url = $allRecords[$cr + $urlN];
        chop($url);
        for ($j = 0; $j < $numlines_bk2site; $j++) {
            print DB "$allRecords[$cr + $j]";
        }
        if (defined $error{$url} && $error{$url} > 0) {
            print DB "\n";
        } elsif (defined $meta{$url}) {
            print DB "$meta{$url}\n";
        } else {
            print DB "\n";
        }
    }
    close DB;
}

# ------------------------------------------------------------------
# add url + filename
sub make_url {
    local($url, $file) = @_;
    print "\nmake url [$url][$file]\n" if $debug;
    if ( $file =~ /^http/) {            # full url
        return $file;
    } elsif ( $file =~ /^\//) {         # filename starting with /
        $url =~ /(.*\/\/[^\/]*)\//;     # get only the host part
        return $1 . $file;
    }
    if ( $url =~ /\/$/ ) {              # url ends with /
        return $url . $file;
    } elsif ( $url =~ /(.*\/)[^\/]+/ ) { # url with ..../file.ext
        # print "+\n";
        return $1 . $file;
    } else {
        return $url . "/" . $file;
    }
}
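# Illustration only (these calls are not in the original script) - the
# relative-link resolution above behaves roughly as follows:
#
#   make_url("http://host.example/dir/page.html", "http://other.example/x")
#       -> "http://other.example/x"                (already absolute)
#   make_url("http://host.example/dir/page.html", "/img/logo.gif")
#       -> "http://host.example/img/logo.gif"      (root-relative)
#   make_url("http://host.example/dir/", "frame.html")
#       -> "http://host.example/dir/frame.html"    (base ends with /)
#   make_url("http://host.example/dir/page.html", "frame.html")
#       -> "http://host.example/dir/frame.html"    (last path segment stripped)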

sub conv_iso {
    local($line) = @_;
    $line =~ s/Æ/
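# ------------------------------------------------------------------
# Illustration only - none of the following is part of the original
# script.  The header comments above mention fetching the urls with
# LWP::Parallel::UserAgent (--maxpar hosts in parallel); a minimal,
# hypothetical sketch of that register/wait pattern, using the module's
# documented interface and the configuration variables defined above
# (@urls_to_check is an assumed name), would look roughly like this:
#
#   my $pua = LWP::Parallel::UserAgent->new();
#   $pua->agent($agent);                  # "spider name" sent to servers
#   $pua->max_hosts($maxpar);             # hosts queried in parallel
#   $pua->timeout($urlto) if $urlto;      # per-request timeout
#   foreach my $url (@urls_to_check) {
#       $pua->register(HTTP::Request->new(GET => $url));
#   }
#   my $entries = $pua->wait();           # block until all requests finish
#   foreach my $key (keys %$entries) {
#       my $res = $entries->{$key}->response;
#       # ... inspect $res->code, extract <meta> info, update %meta/%error
#   }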