📄 create_db.pl
字号:
#!/usr/bin/perl## kjl/29dec2002## create_db.pl## Create miscellaneous databases for bibliometric ranking# and other purposes.## Usage:## create_db.pl [-H path]## where path is the path to the broker, usually# /usr/local/harvest/brokers/YOUR_BROKER## You can invoke it directly at command line or# embed it into glimpseindex script.## Add the perl library path to the @INC path@P = split('/', $0); pop @P; $medir = join('/',@P);unshift(@INC, $medir);use DB_File;require 'normal.pm';## Edit below if necessary##$DEBUG = 1;# Where to put the database files$db_rev = "db.ref_rev";$db_url = "db.ref_url";$db_cache = "db.ref_cache";$db_clean = "db.ref_clean";# Default broker directory. This can be overriden by# "-H /your/path" argument.$BROKERDIR = "/usr/local/harvest/brokers/tengu.local";# End of configuration# No changes necessary below# Where is the broker directory$DATADIR = parse_arg ();if ($DATADIR eq "") { $DATADIR = $BROKERDIR;}chdir $DATADIR or die "Can't chdir to $DATADIR: $!";unlink_db_file ();tie_db ();put_log ("Parsing objects");parse_obj ();put_log ("Building cache databases");build_cache ();put_log ("Building cleaned up database");clean_base ();untie_db ();put_log ("Finished");# End of program## Parse arguments and fetch the broker directory#sub parse_arg { my $i = 0; my $datadir = ""; foreach (@ARGV) { $i++; last if /\-H/o; } $datadir = $ARGV[$i] if ($i <= $#ARGV); return $datadir;}## remove database files#sub unlink_db_file { unlink $db_fwd, $db_rev, $db_url, $db_clean, $db_cache;}## open databases#sub tie_db { # Allow duplicates in BTREE databases. $DB_BTREE->{'flags'} = R_DUP; $REV_STAT = tie %rev_hash, 'DB_File', $db_rev, O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Can't open DB file $db_rev: $!"; $URL_STAT = tie %url_hash, 'DB_File', "$db_url", O_RDWR|O_CREAT, 0666, $DB_HASH or die "Can't open DB file $db_url: $!"; $CLEAN_STAT = tie %clean_hash, 'DB_File', "$db_clean", O_RDWR|O_CREAT, 0666, $DB_BTREE or die "Can't open DB file $db_clean: $!"; $CACHE_STAT = tie %cache_hash, 'DB_File', "$db_cache", O_RDWR|O_CREAT, 0666, $DB_HASH or die "Can't open DB file $db_cache: $!";}## close databases#sub untie_db { undef $REV_STAT; untie %rev_hash; undef $URL_STAT; untie %url_hash; undef $CLEAN_STAT; untie %clean_hash; undef $CACHE_STAT; untie %cache_hash;}## print current local time#sub print_time { my $now = localtime(); print "$now";}## log string#sub put_log { my $log = shift; print_time (); print "\t$log\n";}## recurse through broker's object directory and create database#sub parse_obj { my $OBJDIR = "objects"; my ($dir, $obj); opendir (DIR, "$OBJDIR"); while ($dir = readdir (DIR)) { next if (($dir eq "\.") or ($dir eq "\.\.")); opendir (SUBDIR, "$OBJDIR/$dir"); while ($obj = readdir (SUBDIR)) { next if (($obj eq "\.") or ($obj eq "\.\.")); process_obj ("$OBJDIR/$dir/$obj"); } closedir (SUBDIR); } closedir (DIR);}## process one object#sub process_obj { my $obj = shift; my ($url, $ttype, $ref, @ref_ary, $proto, $t); my ($host, $path, $file, $prev); open (IN, "<$obj") or die "Can't open $obj: $!"; ($url, $ttype, $ref) = extract_ref(); close (IN); chop $url; @ref_ary = sort (split /\n/, $ref); next if (($url eq "") || ($ttype eq "") || ($ref_ary[0] eq "") || (($ttype ne "HTML") && ($ttype ne "HTTP-Query"))); print "$obj\t$url\t$ttype\n"; # ($proto, $host, $path, $file) = $url =~ /(\S+):\/\/(\S+?)\/(.*?)\/(.*)/; ($proto, $t) = $url =~ /(\w+?):(.*)/; ($host) = $t =~ /\/\/(\S+)/; if ($host =~ /\//) { ($host, $t) = $host =~ /(\S+?)\/(.*)/; ($path, $file) = $t =~ /(.*)\/(.*)/; } $prev = ""; while ($_ = shift @ref_ary) { next if /^#/; s/(\S+)#.*/$1/; if (/\w+:\S+/) { $link = $_; } else { $link = "http://$host/$path/$_"; } $link = normal::normalize_uri ($link); next if ($prev eq $link); $prev = $link; build_db ($url, $link); }}## Build databases#sub build_db { my $url = shift; my $link = shift; $rev_hash {$link} = $url; $url_hash {$url} = 1; # any value, we need just exists $key}## Build a db with URL -> number of references to the URL.# This currently only counts direct hits only.#sub build_cache { my $key, $n; foreach $key (keys %url_hash) { $n = $REV_STAT->get_dup($key); print "$n\t$key\n"; $cache_hash {$key} = $n if ($n > 1); }}## Build a databse containing only URLs, which is also in the Broker.#sub clean_base { my @ref_ary; foreach $key (keys %url_hash) { if (exists $rev_hash {$key}) { print "Copying $key->\n"; @ref_ary = $REV_STAT->get_dup ($key); foreach (@ref_ary) { print "\t$_\n"; $clean_hash {$key} = $_; } } else { print "$key\tignored\n"; } }}## extract references from object#sub extract_ref { my ($ttype, $url, %SOIF) = soif_parse(); # We only need FILE. return if ($ttype ne "FILE"); return if (!exists $SOIF{'type'}); return if (!exists $SOIF{'url-references'}); return ("$url\n", $SOIF{'type'}, $SOIF{'url-references'});}## This is from soif.pl.# soif_parse - Returns an associative array containing the SOIF,# the template type, and the URL.#sub soif_parse { print "Inside soif_parse.\n" if ($debug); return () if (eof(IN)); # DW my $template_type = "UNKNOWN"; my $url = "UNKNOWN"; my %SOIF; undef %SOIF; my ($attr, $vsize, $value, $end_value); while (<IN>) { print "READING input line: $_\n" if ($debug); last if (/^\@\S+\s*{\s*\S+\s*$/o); } if (/^\@(\S+)\s*{\s*(\S+)\s*$/o) { $template_type = $1, $url = $2; } else { return ($template_type, $url, %SOIF); # done } while (<IN>) { if (/^\s*([^{]+){(\d+)}:\t(.*\n)/o) { $attr = $1; $vsize = $2; $value = $3; if (length($value) < $vsize) { $nleft = $vsize - length($value); $end_value = ""; $x = read(IN, $end_value, $nleft); die "Cannot read $nleft bytes: $!" if ($x != $nleft); $value .= $end_value; undef $end_value; } chop ($value) if ($value =~ /\n$/); $SOIF{$attr} = $value; undef $value; undef $end_value; next; } last if (/^}/o); } return ($template_type, $url, %SOIF);}=unused## return file name of a string#sub basename { my $path = shift; $path =~ /.*\/(.*)/; return $1;}## return directory name of a string#sub dirname { my $path = shift; $path =~ /(.*)\/.*/; return $1;}=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -