📄 goolink.pl
字号:
#!/usr/bin/perl
# ------------------------------------------------------------------------------------------------
# goolink.pl -- v0.5.2 - Fri Jul 30 09:22:50 UTC 2004
# murfie [ AT ] murfnet.xs4all.nl
#
# p u r p o s e :
#   * parse all the hyperlinks in a saved google search results page
#     so they can be downloaded with 1 command (wget -i results.html)
#     or they can be used with other scripts (hostlookup etc..)
#   * ignore links back to google.com
#   * ignore links to cached documents on Google
#
# r e q u i r e m e n t s :
#   * perl 5.6+ with URI::Find installed
#     nix:   perl -MCPAN -e 'shell'  ->  cpan> install URI::Find
#     win32: ppm> install URI-Find
#
# u s a g e :
#   1) do a google search and save it to a file
#      (on windows systems, save results in HTML (only) format !!)
#   2) unix  -> ./goolink.pl -pv index.of.cgi.search.html
#      win32 -> perl goolink.pl -pv index.of.cgi.search.html
#
# r e m a r k s :
#   - Some vars are global and changed by subs, not pretty but it works for now ..
#   - Google's cached documents cannot be retrieved by wget, so they are ignored ..
#   - Do not try to write http regex yourself, the URI::Find module is completely RFC compliant.
#
# t o d o :
#   - run the google query from goolink and save all results pages to 1 input file
# ------------------------------------------------------------------------------------------------

require URI::Find;    # http://search.cpan.org/~rosch/URI-Find-0.13/lib/URI/Find.pm
use strict;           # standard
use warnings;         # standard
use Getopt::Std;      # standard

# global settings (shared with the URI::Find callback below)
our %conf;                  # global config flags, set from the command line
our @results;               # global links collection (filled by addplaintextlink sub)
our $out = 'results.txt';   # outfile with list of hyperlinks
our $httpcounter = 0;       # per-file result counter
our $httptotal   = 0;       # total result counter

# parse command-line switches into %conf
my %args;
getopts("cdfpv", \%args);
$conf{"clean"}    = defined $args{"c"} ? 1 : 0;
$conf{"files"}    = defined $args{"f"} ? 1 : 0;
$conf{"protocol"} = defined $args{"p"} ? 1 : 0;
$conf{"verbose"}  = defined $args{"v"} ? 1 : 0;
if (defined $args{"d"}) {
    # -d implies -p: a bare domain makes no sense with the protocol still attached
    $conf{"domain"}   = 1;
    $conf{"protocol"} = 1;
}
else {
    $conf{"domain"} = 0;
}

# determine how we get input file(s)
# NOTE: defined(@ARGV) in the original is a fatal error on perl >= 5.22;
#       a plain boolean test of @ARGV (true iff non-empty) is the correct idiom.
my @searchfiles;
if (@searchfiles == 0 && !@ARGV) { usage() }
if (@searchfiles == 0) { @searchfiles = @ARGV }

# walk through input files
# open magic URI search box :) ..
my $finder = URI::Find->new(\&addplaintextlink);
foreach my $sf (@searchfiles) {
    # three-arg open with a lexical handle; skip (do not die on) missing files
    my $in;
    if (!open $in, '<', $sf) {
        print "[-] warning: $sf not found !! (skipping file)\n";
        next;
    }
    print "[+] parsing $sf\n";
    $httpcounter = 0;
    while (my $line = <$in>) {
        # the callback pushes every acceptable URI onto @results
        $finder->find(\$line);
    }
    print "[+] $sf contains $httpcounter http links\n";
    $httptotal += $httpcounter;
    close $in;
}

# showem if we gottem
if ($httptotal > 0) {
    open my $outfh, '>', $out or die "[-] no write permissions in this directory ?\n";
    print "[+] writing total of $httptotal links to $out\n";
    foreach my $link (sort @results) {
        print {$outfh} "$link\n";
    }
    print "[+] try 'wget -t 1 -T 10 -nv -i $out' to retrieve the files\n";
    close $outfh;
}
else {
    print "[-] sorry, no results\n";
}

# end script
print "Done !\n";

# Callback for URI::Find: receives the URI object and the original matched
# text, filters out google/cache links, trims the URL according to the
# -c/-p/-d/-f switches, and collects the result in the global @results.
sub addplaintextlink {
    my ($uri, $uri_orig) = @_;
    return if $uri_orig =~ /google|cache/;    # not foolproof i know.. fixme :)
    my $url = $uri_orig;

    # clean up left over <html> tags
    $url =~ s/<//;
    $url =~ s/>//;

    my ($link, $junk);
    if ($conf{"clean"}) {
        # -c: drop the query part after '?'
        ($link, $junk) = split /\?/, $url;
        $url = $link if defined $link;
    }
    if ($conf{"protocol"}) {
        # -p: drop the leading 'http://' (or other scheme)
        ($junk, $link) = split m{://}, $url;
        $url = $link if defined $link;
    }
    if ($conf{"domain"}) {
        # -d: keep only the host part before the first '/'
        ($link, $junk) = split m{/}, $url;
        $url = $link if defined $link;
    }
    if ($conf{"files"}) {
        # -f: keep only the filename after the last '/'
        ($junk, $link) = split m{.*/}, $url;
        $url = $link if defined $link;
    }
    $link = $url if !defined $link;

    push @results, $link;
    ++$httpcounter;
    print "$httpcounter: $link\n" if $conf{"verbose"};
}

# Print usage summary and exit; called when no input files were given.
sub usage {
    print qq~Usage: $0 [options] <filename> <..> <filename>
 -c clean (remove query part from URL)
 -d domains (show only domain and subdomains)
 -f filenames (show filenames only)
 -p no protocol (remove protocol from link)
 -v verbose (show every result on STDOUT)

 example: $0 -vd savedsearch.txt
 example: $0 s1.html s2.html
 example: $0 -c -p s*~;
    exit;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -