⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 soif2gils.pl.in

📁 harvest是一个下载html网页得机器人
💻 IN
字号:
#!@PERL@## kjl/26apr2003## soif2gils.pl##   Translate SOIF files to GILS files.## Usage:##   soif2gils.pl [-H path]##     where path is the path to the broker, usually#     /usr/local/harvest/brokers/YOUR_BROKER##     You can invoke this directly at command line or#     embed it into glimpseindex script.## Edit below if necessary##$DEBUG = 1;# Default broker directory. This can be overriden by# "-H /your/path" argument.$BROKERDIR = "/usr/local/harvest/brokers/tab-server.tab";# End of configuration# No changes necessary below# Where is the broker directory$DATADIR = parse_arg ();if ($DATADIR eq "") {    $DATADIR = $BROKERDIR;}chdir $DATADIR or die "Can't chdir to $DATADIR: $!";# directory names$OBJDIR  = "objects";      # directory containing objects$GILSDIR = "objects-xml";  # where to put GILS files# create directories for the GILS files if necessaryput_log ("Preparing GILS directory tree");make_subdirs ();put_log ("GILS directory tree ok");put_log ("Parsing objects");process_objs ();put_log ("Finished");# End of program## Parse arguments and fetch the broker directory#sub parse_arg {    my $i = 0;    my $datadir = "";    foreach (@ARGV) {	$i++;	last if /\-H/o;    }    $datadir = $ARGV[$i] if ($i <= $#ARGV);    return $datadir;}## print current local time#sub print_time {    my $now = localtime();    print "$now";}## log string#sub put_log {    my $log = shift;    print_time ();    print "\t$log\n";}## create directories for GILS files if necessary#sub make_subdirs {    my $dir;    if (! -d $GILSDIR) {	mkdir ($GILSDIR)	    or die "Can't create directory $GILSDIR: $!";    }    if (! -d "$GILSDIR/0") {	opendir (DIR, "$OBJDIR");	while ($dir = readdir (DIR)) {	    next if (($dir eq "\.") or ($dir eq "\.\."));	    put_log ("Creating $GILSDIR/$dir");	    mkdir ("$GILSDIR/$dir")		or die "Can't create directory $GILSDIR/$dir: $!";	}	closedir (DIR);    }}## recurse through broker's object directory and create GILS files#sub process_objs {    my ($dir, $obj);    opendir (DIR, "$OBJDIR");    while ($dir = readdir (DIR)) {	next if (($dir eq "\.") or ($dir eq "\.\."));	opendir (SUBDIR, "$OBJDIR/$dir");	while ($obj = readdir (SUBDIR)) {	    next if (($obj eq "\.") or ($obj eq "\.\."));	    soif2gils ("$dir/$obj");	}	closedir (SUBDIR);    }    closedir (DIR);}## remove empty lines and encode offending character#sub clean {    my $lines = shift;    # remove empty lines    $lines =~ s/\n\s+\n/\n/g;    # encode offending character    $lines =~ s/</&lt;/g;    return $lines;}## convert soif to gils#sub soif2gils {    my $obj = shift;    my ($ttype, $url, %SOIF);    open (IN, "<$OBJDIR/$obj")	or die "Can't open $OBJDIR/$obj: $!";    ($ttype, $url, %SOIF) = soif_parse();    close (IN);    foreach $key (keys %SOIF) {	$SOIF{"$key"} = clean ($SOIF{"$key"});    }    put_log ("Writing $GILSDIR/$obj.gils");    open (OUT, ">$GILSDIR/$obj.gils")	or die "Can't open $GILSDIR/$obj.gils: $!";    print OUT "<gils>\n\n";    print OUT "<availability>\n";    print OUT "  <linkage>\n";    print OUT "    $url\n";    print OUT "  </linkage>\n";    #print OUT "  <linkageType>\n";    #print OUT "    $ttype\n";    #print OUT "  </linkageType>\n";    print OUT "</availability>\n\n";    print OUT "<dateOfLastModification>\n";    print OUT "  ", $SOIF{'last-modification-time'}, "\n";    print OUT "</dateOfLastModification>\n\n";    #print OUT "<controlIdentifier>\n";    #print OUT "  ", $SOIF{'md5'}, "\n";    #print OUT "</controlIdentifier>\n\n";    print OUT "<abstract>\n";    print OUT "  ", $SOIF{'description'}, "\n";    print OUT "</abstract>\n\n";    print OUT "<author>\n";    print OUT "  ", $SOIF{'author'}, "\n";    print OUT "</author>\n\n";    print OUT "<localSubjectIndex>\n";    print OUT "  <localSubjectTerm>\n";    print OUT "  ", $SOIF{'keywords'}, "\n";    print OUT "  </localSubjectTerm>\n";    print OUT "</localSubjectIndex>\n\n";    print OUT "<supplementalInformation>\n";    print OUT "  <bytes>\n";    print OUT "  ", $SOIF{'file-size'}, "\n";    print OUT "  </bytes>\n";    #print OUT "  <lastChecked>\n";    #print OUT "  ", $SOIF{'update-time'}, "\n";    #print OUT "  </lastChecked>\n";    print OUT "</supplementalInformation>\n\n";    print OUT "<crossReference>\n";    print OUT "  <linkage>\n";    print OUT "  ", $SOIF{'url-references'}, "\n";    print OUT "  </linkage>\n";    print OUT "</crossReference>\n\n";    print OUT "<title>\n";    print OUT "  ", $SOIF{'title'}, "\n";    print OUT "</title>\n\n";    #print OUT "<sampleText>\n";    print OUT "<Body-of-text>\n";    print OUT $SOIF{'body'},         "\n" if exists $SOIF{'body'};    print OUT $SOIF{'full-text'},    "\n" if exists $SOIF{'full-text'};    print OUT $SOIF{'partial-text'}, "\n" if exists $SOIF{'partial-text'};    print OUT "</Body-of-text>\n\n";    #print OUT "</sampleText>\n\n";    print OUT "</gils>\n";    close (OUT);}##  This is from soif.pl.#  soif_parse - Returns an associative array containing the SOIF,#		the template type, and the URL.#sub soif_parse {	print "Inside soif_parse.\n" if ($debug);        return () if (eof(IN));       # DW	my $template_type = "UNKNOWN";	my $url = "UNKNOWN";	my %SOIF;	undef %SOIF;	my ($attr, $vsize, $value, $end_value);	while (<IN>) {		print "READING input line: $_\n" if ($debug);		last if (/^\@\S+\s*{\s*\S+\s*$/o);	}	if (/^\@(\S+)\s*{\s*(\S+)\s*$/o) {		$template_type = $1, $url = $2;	} else {		return ($template_type, $url, %SOIF);	# done	}	while (<IN>) {                if (/^\s*([^{]+){(\d+)}:\t(.*\n)/o) {			$attr = $1;			$vsize = $2;			$value = $3;			if (length($value) < $vsize) {				$nleft = $vsize - length($value);				$end_value = "";				$x = read(IN, $end_value, $nleft);				die "Cannot read $nleft bytes: $!"					if ($x != $nleft);				$value .= $end_value;				undef $end_value;			}                        chop ($value) if ($value =~ /\n$/);                        $SOIF{$attr} = $value;			undef $value;			undef $end_value;			next;		}		last if (/^}/o);	}	return ($template_type, $url, %SOIF);}=unused## return file name of a string#sub basename {    my $path = shift;    $path =~ /.*\/(.*)/;    return $1;}## return directory name of a string#sub dirname {    my $path = shift;    $path =~ /(.*)\/.*/;    return $1;}=cut

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -