⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 soif2xml.pl.in

📁 harvest是一个下载html网页的机器人
💻 IN
字号:
#!@PERL@
#
# kjl/30oct2003
#
# soif2xml.pl
#
#   Transform SOIF files to XML files.
#
# Usage:
#
#   soif2xml.pl [-H path]
#
#     where path is the path to the broker, usually
#     /usr/local/harvest/brokers/YOUR_BROKER
#
#     You can invoke this directly at command line or
#     embed it into glimpseindex script.
#
# Overview:
#
#   The script changes into the broker directory, mirrors the layout of
#   the "objects" directory under "objects-xml", removes XML files whose
#   SOIF object has disappeared, and (re)writes one XML file per SOIF
#   object that is newer than its XML counterpart.
#
# Edit below if necessary
#
use strict;
use warnings;
use File::stat;

# Default broker directory. This can be overridden by
# "-H /your/path" argument.
my $BROKERDIR = "/usr/local/harvest/brokers/tengu.local";

# SOIF attributes that are not copied into the XML output.
my %unwanted = (
    "gatherer-host"    => 1,
    "gatherer-name"    => 1,
    "gatherer-version" => 1,
    "md5"              => 1,
    "refresh-rate"     => 1,
    "time-to-live"     => 1,
    "update-time"      => 1,
    "uri"              => 1,
    "url-references"   => 1,
);

# End of configuration
# No changes necessary below

# Where is the broker directory
my $DATADIR = parse_arg();
if ($DATADIR eq "") {
    $DATADIR = $BROKERDIR;
}
chdir $DATADIR or die "Can't chdir to $DATADIR: $!";

# directory names (relative to the broker directory)
my $OBJDIR = "objects";       # directory containing objects
my $XMLDIR = "objects-xml";   # where to put XML files

# create directories for XML files if necessary
put_log("Preparing XML directory tree");
if (-d "$XMLDIR/0") {
    clean_subdirs();
}
else {
    make_subdirs();
}
put_log("XML directory tree ok");

put_log("Parsing objects");
process_objs();
put_log("Finished");

# End of program

#
# Parse arguments and fetch the broker directory
#
# Returns the argument following the first "-H" switch, or "" when no
# "-H" is given or when it is the last argument.  The switch must match
# exactly, so an unrelated argument that merely contains "-H" (for
# example a path such as /data/my-Harvest) is not mistaken for it.
#
sub parse_arg {
    for my $i (0 .. $#ARGV - 1) {
        return $ARGV[$i + 1] if $ARGV[$i] eq "-H";
    }
    return "";
}

#
# print current local time
#
# Writes "YYYYMMDD HH:MM:SS" to standard output without a newline.
#
sub print_time {
    my @now = localtime();
    printf("%d%02d%02d %02d:%02d:%02d",
           $now[5] + 1900, $now[4] + 1, $now[3], $now[2], $now[1], $now[0]);
}

#
# log string
#
# Prints "soif2xml.pl: <timestamp>: <message>" to standard output.
#
sub put_log {
    my $log = shift;

    print "soif2xml.pl: ";
    print_time();
    print ": $log\n";
}

#
# recurse through broker's XML directory and delete orphaned XML files
#
# An XML file is orphaned when the object of the same name (without the
# ".xml" suffix) no longer exists below $OBJDIR.  Dies if an orphan
# cannot be removed.
#
sub clean_subdirs {
    my $top;
    if (!opendir($top, $XMLDIR)) {
        put_log("Can't open directory $XMLDIR: $!");
        return;
    }

    # Test readdir results with defined(): "0" is a valid directory name.
    while (defined(my $dir = readdir($top))) {
        next if $dir eq "." or $dir eq "..";

        my $sub;
        if (!opendir($sub, "$XMLDIR/$dir")) {
            put_log("Can't open directory $XMLDIR/$dir: $!");
            next;
        }

        while (defined(my $obj = readdir($sub))) {
            next if $obj eq "." or $obj eq "..";

            (my $soif_obj = $obj) =~ s/\.xml$//;
            if (! -f "$OBJDIR/$dir/$soif_obj") {
                put_log("Unlinking orphaned XML file $XMLDIR/$dir/$obj");
                unlink "$XMLDIR/$dir/$obj"
                    or die "Can't unlink $XMLDIR/$dir/$obj: $!";
            }
        }
        closedir($sub);
    }
    closedir($top);
}

#
# create directories for XML files if necessary
#
# Creates $XMLDIR and one subdirectory for every subdirectory of
# $OBJDIR.  Subdirectories that already exist (for example after an
# interrupted earlier run) are left alone instead of aborting the run.
#
sub make_subdirs {
    if (! -d $XMLDIR) {
        mkdir($XMLDIR)
            or die "Can't create directory $XMLDIR: $!";
    }
    return if -d "$XMLDIR/0";

    my $top;
    if (!opendir($top, $OBJDIR)) {
        put_log("Can't open directory $OBJDIR: $!");
        return;
    }

    while (defined(my $dir = readdir($top))) {
        next if $dir eq "." or $dir eq "..";
        next if ! -d "$OBJDIR/$dir";    # only mirror directories
        next if -d "$XMLDIR/$dir";      # already there

        put_log("Creating $XMLDIR/$dir");
        mkdir("$XMLDIR/$dir")
            or die "Can't create directory $XMLDIR/$dir: $!";
    }
    closedir($top);
}

#
# recurse through broker's object directory and create XML files
#
# Calls soif2xml() for every entry of every subdirectory of $OBJDIR.
# The matching XML subdirectory is created on demand, so object
# directories that appeared after the XML tree was first built do not
# make the later open-for-write fail.
#
sub process_objs {
    my $top;
    if (!opendir($top, $OBJDIR)) {
        put_log("Can't open directory $OBJDIR: $!");
        return;
    }

    while (defined(my $dir = readdir($top))) {
        next if $dir eq "." or $dir eq "..";

        my $sub;
        if (!opendir($sub, "$OBJDIR/$dir")) {
            put_log("Can't open directory $OBJDIR/$dir: $!");
            next;
        }

        if (! -d "$XMLDIR/$dir") {
            put_log("Creating $XMLDIR/$dir");
            mkdir("$XMLDIR/$dir")
                or die "Can't create directory $XMLDIR/$dir: $!";
        }

        while (defined(my $obj = readdir($sub))) {
            next if $obj eq "." or $obj eq "..";
            soif2xml("$dir/$obj");
        }
        closedir($sub);
    }
    closedir($top);
}

#
# remove empty lines and encode offending character
#
# Returns a copy of the argument in which whitespace-only lines are
# dropped and "&", "<" and ">" are replaced by their XML entities.
# An undefined argument is treated as the empty string.
#
sub clean {
    my $lines = shift;
    $lines = "" if !defined $lines;

    # remove empty lines
    $lines =~ s/\n\s+\n/\n/g;

    # encode offending character ("&" first, so that the entities
    # produced below are not encoded a second time)
    $lines =~ s/&/&amp;/g;
    $lines =~ s/</&lt;/g;
    $lines =~ s/>/&gt;/g;

    return $lines;
}

#
# convert soif to xml
#
# Takes an object name relative to $OBJDIR ("<subdir>/<file>") and
# writes "$XMLDIR/<subdir>/<file>.xml".  Nothing is written when an XML
# file exists that is strictly newer than the SOIF object.  Dies when
# the object cannot be read or the XML file cannot be written.
#
sub soif2xml {
    my $obj = shift;

    my $soif_file = "$OBJDIR/$obj";
    my $xml_file  = "$XMLDIR/$obj.xml";

    if (-f $xml_file) {
        my $f_soif = stat($soif_file);
        my $f_xml  = stat($xml_file);
        if ($f_soif && $f_xml && $f_soif->mtime < $f_xml->mtime) {
            put_log("Unchanged $xml_file");
            return;
        }
    }

    open(my $in, '<', $soif_file)
        or die "Can't open $soif_file: $!";
    my ($ttype, $url, %soif) = soif_parse($in);
    close($in);

    # soif_parse() returns an empty list for an empty file
    $soif{'url'} = defined $url ? $url : "";

    put_log("Writing $xml_file");
    open(my $out, '>', $xml_file)
        or die "Can't open $xml_file: $!";
    print {$out} "<xsoif>\n";
    foreach my $key (sort keys %soif) {
        print {$out} "<$key>\n";
        print {$out} clean($soif{$key}), "\n";
        print {$out} "</$key>\n";
    }
    print {$out} "</xsoif>\n";

    # buffered write errors only show up when the handle is closed
    close($out)
        or die "Can't close $xml_file: $!";
}

#
#  This is from soif.pl.
#  soif_parse - Returns an associative array containing the SOIF,
#               the template type, and the URL.
#
#  Takes the file handle to read from.  The return value is the list
#  (template type, URL, attribute => value, ...), or the empty list
#  when the handle is already at end of file.  Attributes listed in
#  %unwanted are skipped.  Parsing stops at the line starting with "}".
#
sub soif_parse {
    my $fh = shift;

    return () if eof($fh);        # DW

    my $template_type = "UNKNOWN";
    my $url           = "UNKNOWN";
    my %soif;

    # Look for the template header:  @TYPE { URL
    my $found_header = 0;
    while (my $line = <$fh>) {
        if ($line =~ /^\@(\S+)\s*\{\s*(\S+)\s*$/) {
            ($template_type, $url) = ($1, $2);
            $found_header = 1;
            last;
        }
    }
    return ($template_type, $url, %soif) if !$found_header;    # done

    # Attribute lines look like:  name{size}:<TAB>value
    while (my $line = <$fh>) {
        if ($line =~ /^\s*([^{]+)\{(\d+)\}:\t(.*\n)/) {
            my ($attr, $vsize, $value) = ($1, $2, $3);
            my $have = length($value);

            if ($have < $vsize) {
                # The value continues past this line; the declared size
                # says how many bytes are still to come.
                my $nleft = $vsize - $have;
                if (exists $unwanted{$attr}) {
                    seek($fh, $nleft, 1)
                        or die "Cannot seek $nleft bytes: $!";
                    next;
                }
                my $rest = "";
                my $got  = read($fh, $rest, $nleft);
                die "Cannot read $nleft bytes: $!"
                    if (!defined $got or $got != $nleft);
                $value .= $rest;
            }
            elsif (exists $unwanted{$attr}) {
                next;
            }

            $value =~ s/\n\z//;    # drop one trailing newline, if any
            $soif{$attr} = $value;
            next;
        }
        last if $line =~ /^\}/;
    }

    return ($template_type, $url, %soif);
}

=unused

#
# return file name of a string
#
sub basename {
    my $path = shift;
    $path =~ /.*\/(.*)/;
    return $1;
}

#
# return directory name of a string
#
sub dirname {
    my $path = shift;
    $path =~ /(.*)\/.*/;
    return $1;
}

=cut

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -