📄 create_db.pl

📁 harvest是一个下载html网页得机器人
💻 PL
字号:
#!/usr/bin/perl## kjl/29dec2002## create_db.pl##   Create miscellaneous databases for bibliometric ranking#   and other purposes.## Usage:##   create_db.pl [-H path]##     where path is the path to the broker, usually#     /usr/local/harvest/brokers/YOUR_BROKER##     You can invoke it directly at command line or#     embed it into glimpseindex script.## Add the perl library path to the @INC path@P = split('/', $0); pop @P; $medir = join('/',@P);unshift(@INC, $medir);use DB_File;require 'normal.pm';## Edit below if necessary##$DEBUG = 1;# Where to put the database files$db_rev   = "db.ref_rev";$db_url   = "db.ref_url";$db_cache = "db.ref_cache";$db_clean = "db.ref_clean";# Default broker directory. This can be overriden by# "-H /your/path" argument.$BROKERDIR = "/usr/local/harvest/brokers/tengu.local";# End of configuration# No changes necessary below# Where is the broker directory$DATADIR = parse_arg ();if ($DATADIR eq "") {    $DATADIR = $BROKERDIR;}chdir $DATADIR or die "Can't chdir to $DATADIR: $!";unlink_db_file ();tie_db ();put_log ("Parsing objects");parse_obj ();put_log ("Building cache databases");build_cache ();put_log ("Building cleaned up database");clean_base ();untie_db ();put_log ("Finished");# End of program## Parse arguments and fetch the broker directory#sub parse_arg {    my $i = 0;    my $datadir = "";    foreach (@ARGV) {	$i++;	last if /\-H/o;    }    $datadir = $ARGV[$i] if ($i <= $#ARGV);    return $datadir;}## remove database files#sub unlink_db_file {    unlink $db_fwd, $db_rev, $db_url, $db_clean, $db_cache;}## open databases#sub tie_db {    # Allow duplicates in BTREE databases.    $DB_BTREE->{'flags'} = R_DUP;    $REV_STAT = tie %rev_hash,     'DB_File', $db_rev,     O_RDWR|O_CREAT, 0666, $DB_BTREE	or die "Can't open DB file $db_rev: $!";    $URL_STAT = tie %url_hash,     'DB_File', "$db_url",   O_RDWR|O_CREAT, 0666, $DB_HASH	or die "Can't open DB file $db_url: $!";    $CLEAN_STAT = tie %clean_hash, 'DB_File', "$db_clean", O_RDWR|O_CREAT, 0666, $DB_BTREE	or die "Can't open DB file $db_clean: $!";    $CACHE_STAT = tie %cache_hash, 'DB_File', "$db_cache", O_RDWR|O_CREAT, 0666, $DB_HASH	or die "Can't open DB file $db_cache: $!";}## close databases#sub untie_db {    undef $REV_STAT;    untie %rev_hash;    undef $URL_STAT;    untie %url_hash;    undef $CLEAN_STAT;    untie %clean_hash;    undef $CACHE_STAT;    untie %cache_hash;}## print current local time#sub print_time {    my $now = localtime();    print "$now";}## log string#sub put_log {    my $log = shift;    print_time ();    print "\t$log\n";}## recurse through broker's object directory and create database#sub parse_obj {    my $OBJDIR = "objects";    my ($dir, $obj);    opendir (DIR, "$OBJDIR");    while ($dir = readdir (DIR)) {	next if (($dir eq "\.") or ($dir eq "\.\."));	opendir (SUBDIR, "$OBJDIR/$dir");	while ($obj = readdir (SUBDIR)) {	    next if (($obj eq "\.") or ($obj eq "\.\."));	    process_obj ("$OBJDIR/$dir/$obj");	}	closedir (SUBDIR);    }    closedir (DIR);}## process one object#sub process_obj {    my $obj = shift;    my ($url, $ttype, $ref, @ref_ary, $proto, $t);    my ($host, $path, $file, $prev);    open (IN, "<$obj") or die "Can't open $obj: $!";    ($url, $ttype, $ref) = extract_ref();    close (IN);    chop $url;    @ref_ary = sort (split /\n/, $ref);    next if (($url eq "") || ($ttype eq "") || ($ref_ary[0] eq "")	     || (($ttype ne "HTML") && ($ttype ne "HTTP-Query")));    print "$obj\t$url\t$ttype\n";    # ($proto, $host, $path, $file) = $url =~ /(\S+):\/\/(\S+?)\/(.*?)\/(.*)/;    ($proto, $t)   = $url =~ /(\w+?):(.*)/;    ($host)        = $t   =~ /\/\/(\S+)/;    if ($host =~ /\//) {	($host,  $t)   = $host   =~ /(\S+?)\/(.*)/;	($path, $file) = $t   =~ /(.*)\/(.*)/;    }    $prev = "";    while ($_ = shift @ref_ary) {	next if /^#/;	s/(\S+)#.*/$1/;	if (/\w+:\S+/) {	    $link = $_;	} else {	    $link = "http://$host/$path/$_";	}	$link = normal::normalize_uri ($link);	next if ($prev eq $link);	$prev = $link;	build_db ($url, $link);    }}## Build databases#sub build_db {    my $url = shift;    my $link = shift;    $rev_hash {$link} = $url;    $url_hash {$url} = 1;  # any value, we need just exists $key}## Build a db with URL -> number of references to the URL.# This currently only counts direct hits only.#sub build_cache {    my $key, $n;    foreach $key (keys %url_hash) {	$n = $REV_STAT->get_dup($key);	print "$n\t$key\n";	$cache_hash {$key} = $n if ($n > 1);    }}## Build a databse containing only URLs, which is also in the Broker.#sub clean_base {    my @ref_ary;    foreach $key (keys %url_hash) {	if (exists $rev_hash {$key}) {	    print "Copying $key->\n";	    @ref_ary = $REV_STAT->get_dup ($key);	    foreach (@ref_ary) {		print "\t$_\n";		$clean_hash {$key} = $_;	    }	} else {	    print "$key\tignored\n";	}    }}## extract references from object#sub extract_ref {    my ($ttype, $url, %SOIF) = soif_parse();    # We only need FILE.    return if ($ttype ne "FILE");    return if (!exists $SOIF{'type'});    return if (!exists $SOIF{'url-references'});    return ("$url\n", $SOIF{'type'}, $SOIF{'url-references'});}##  This is from soif.pl.#  soif_parse - Returns an associative array containing the SOIF,#		the template type, and the URL.#sub soif_parse {	print "Inside soif_parse.\n" if ($debug);        return () if (eof(IN));       # DW	my $template_type = "UNKNOWN";	my $url = "UNKNOWN";	my %SOIF;	undef %SOIF;	my ($attr, $vsize, $value, $end_value);	while (<IN>) {		print "READING input line: $_\n" if ($debug);		last if (/^\@\S+\s*{\s*\S+\s*$/o);	}	if (/^\@(\S+)\s*{\s*(\S+)\s*$/o) {		$template_type = $1, $url = $2;	} else {		return ($template_type, $url, %SOIF);	# done	}	while (<IN>) {                if (/^\s*([^{]+){(\d+)}:\t(.*\n)/o) {			$attr = $1;			$vsize = $2;			$value = $3;			if (length($value) < $vsize) {				$nleft = $vsize - length($value);				$end_value = "";				$x = read(IN, $end_value, $nleft);				die "Cannot read $nleft bytes: $!"					if ($x != $nleft);				$value .= $end_value;				undef $end_value;			}                        chop ($value) if ($value =~ /\n$/);                        $SOIF{$attr} = $value;			undef $value;			undef $end_value;			next;		}		last if (/^}/o);	}	return ($template_type, $url, %SOIF);}=unused## return file name of a string#sub basename {    my $path = shift;    $path =~ /.*\/(.*)/;    return $1;}## return directory name of a string#sub dirname {    my $path = shift;    $path =~ /(.*)\/.*/;    return $1;}=cut
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -