📄 lw2.pm
字号:
$T =~ tr/\0\r\n//d; next if ( length($T) == 0 ); next if ( $T =~ /^#/i ); # fragment push @{ $C->{referrers}->{$T} }, $ST[2] if ( $CONFIG->{save_referrers} > 0 ); if ( $T =~ /^([a-zA-Z0-9]*):/ && lc($1) ne 'http' && lc($1) ne 'https' ) { push @{ $C->{non_http}->{$T} }, $ST[2] if ( $CONFIG->{save_non_http} > 0 ); next; } if ( substr( $T, 0, 2 ) eq '//' && $CONFIG->{netloc_bug} > 0 ) { if ( $REQ->{whisker}->{ssl} > 0 ) { $T = 'https:' . $T; } else { $T = 'http:' . $T; } } if ( $CONFIG->{callback} != 0 ) { next if &{ $CONFIG->{callback} }( $T, $C ); } $T = uri_absolute( $T, $ST[4], $CONFIG->{normalize_uri} ); # (uri,protocol,host,port,params,frag,user,pass) @v = uri_split($T); # make sure URL is on same host and port if ( ( defined $v[2] && $v[2] ne $ST[0] ) || ( $v[3] > 0 && $v[3] != $ST[1] ) ) { $C->{offsites}->{ uri_join(@v) }++ if ( $CONFIG->{save_offsites} > 0 ); next; } if ( $v[0] =~ /\.([a-z0-9]+)$/i ) { if ( defined $CONFIG->{skip_ext}->{ lc($1) } ) { $TRACK->{ $v[0] } = '?' if ( $CONFIG->{save_skipped} > 0 ); next; } } if ( defined $v[4] && $CONFIG->{use_params} > 0 ) { $TRACK->{ $v[0] } = '?' if ( $CONFIG->{params_double_record} > 0 && !defined $TRACK->{ $v[0] } ); $v[0] = $v[0] . '?' . $v[4]; } next if ( defined $TRACK->{ $v[0] } ) ; # we've processed this already # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD, 5.REF ] push @$Q, [ $ST[0], $ST[1], $v[0], $ST[3] + 1, '', $ST[2] ]; } # foreach @$URLS = (); # reset for next round } # while return $COUNT; } # end sub crawl##################################################### sub _crawl_extract_links_test { my ( $TAG, $hr, $dr, $start, $len, $OBJ ) = ( lc(shift), @_ ); return undef if ( !scalar %$hr ); # fastpath quickie # we know this is defined, due to our tagmap my $t = $_crawl_linktags{$TAG}; while ( my ( $key, $val ) = each %$hr ) { # normalize element values $$hr{ lc($key) } = $val; } # all of this just to catch meta refresh URLs if ( $TAG eq 'meta' && defined $$hr{'http-equiv'} && $$hr{'http-equiv'} eq 'refresh' && defined $$hr{'content'} && $$hr{'content'} =~ m/url=(.+)/i ) { push( @{ $OBJ->{urls} }, $1 ); } elsif ( ref($t) ) { foreach (@$t) { push( @{ $OBJ->{urls} }, $$hr{$_} ) if ( defined $$hr{$_} ); } } else { push( @{ $OBJ->{urls} }, $$hr{$t} ) if ( defined $$hr{$t} ); } if ( $TAG eq 'form' && defined $$hr{action} ) { my $u = $OBJ->{response}->{whisker}->{uri}; $OBJ->{forms}->{ uri_absolute( $$hr{action}, $u, 1 ) }++; } return undef; }################################################################ sub _crawl_do_request_ex { my ( $hrin, $hrout, $OBJ ) = @_; my $ret; $ret = http_do_request( $hrin, $hrout ); return ( 2, $ret ) if ( $ret == 2 ); # if there was connection error, do not continue if ( $ret == 0 ) { # successful request # WARNING: what if *all* HEAD respones are 302'd on purpose, but # all GETs are normal? if ( $$hrout{whisker}->{code} < 308 && $$hrout{whisker}->{code} > 300 ) { if ( $OBJ->{config}->{follow_moves} > 0 ) { return ( 3, $ret ) if ( defined $$hrout{location} && !ref( $$hrout{location} ) ); } return ( 5, $ret ); # not avail } if ( $$hrout{whisker}->{code} == 200 ) { # no content-type is treated as text/htm if ( defined $$hrout{'content-type'} && $$hrout{'content-type'} !~ /^text\/htm/i ) { return ( 4, $ret ); } } } return ( -1, $ret ); # fallthrough }################################################################ sub _crawl_do_request { my ( $hrin, $hrout, $OBJ ) = @_; my ( $cret, $lwret ); if ( $OBJ->{config}->{do_head} && $$hrin{whisker}->{method} ne 'HEAD' ) { my $save = $$hrin{whisker}->{method}; $$hrin{whisker}->{method} = 'HEAD'; ( $cret, $lwret ) = _crawl_do_request_ex( $hrin, $hrout, $OBJ ); $$hrin{whisker}->{method} = $save; return $cret if ( $cret > 0 ); if ( $lwret == 0 ) { # successful request if ( $$hrout{whisker}->{code} == 501 ) { # HEAD not allowed $OBJ->{config}->{do_head} = 0; # no more HEAD requests } } # request errors are essentially redone via GET, below } ( $cret, $lwret ) = _crawl_do_request_ex( $hrin, $hrout, $OBJ ); return $lwret if ( $cret < 0 ); return $cret; }} # CRAWL_CONTAINER########################################################################################################################################=item B<dump>Params: $name, \@array [, $name, \%hash, $name, \$scalar ]Return: $code [ undef on error ]The dump function will take the given $name and data reference, andwill create an ASCII perl code representation suitable for eval'inglater to recreate the same structure. $name is the name of the variablethat it will be saved as. Example: $output = LW2::dump('request',\%request);NOTE: dump() creates anonymous structures under the name given. Forexample, if you dump the hash %hin under the name 'hin', then when youeval the dumped code you will need to use %$hin, since $hin is now a*reference* to a hash.=cutsub dump { my %what = @_; my ( $final, $k, $v ) = (''); while ( ( $k, $v ) = each %what ) { return undef if ( ref($k) || !ref($v) ); $final .= "\$$k = " . _dump( 1, $v, 1 ); $final =~ s#,\n$##; $final .= ";\n"; } return $final;}########################################################################=item B<dump_writefile>Params: $file, $name, \@array [, $name, \%hash, $name, \@scalar ]Return: 0 if success; 1 if errorThis calls dump() and saves the output to the specified $file. Note: LW does not checking on the validity of the file name, it'screation, or anything of the sort. Files are opened in overwritemode.=cutsub dump_writefile { my $file = shift; my $output = &dump(@_); return 1 if ( !open( OUT, ">$file" ) || !defined $output ); binmode(OUT); print OUT $output; close(OUT);}########################################################################sub _dump { # dereference and dump an element my ( $t, $ref, $depth ) = @_; my ( $out, $k, $v ) = (''); $depth ||= 1; # to protect against circular loops return 'undef' if ( $depth > 128 ); if ( !defined $ref ) { return 'undef'; } elsif ( ref($ref) eq 'HASH' ) { $out .= "{\n"; while ( ( $k, $v ) = each %$ref ) {# next if ( $k eq '' ); $out .= "\t" x $t; $out .= _dumpd($k) . ' => '; if ( ref($v) ) { $out .= _dump( $t + 1, $v, $depth + 1 ); } else { $out .= _dumpd($v); } $out .= ",\n" unless ( substr( $out, -2, 2 ) eq ",\n" ); } $out =~ s#,\n$#\n#; $out .= "\t" x ( $t - 1 ); $out .= "},\n"; } elsif ( ref($ref) eq 'ARRAY' ) { $out .= "["; if ( ~~@$ref ) { $out .= "\n"; foreach $v (@$ref) { $out .= "\t" x $t; if ( ref($v) ) { $out .= _dump( $t + 1, $v, $depth + 1 ); } else { $out .= _dumpd($v); } $out .= ",\n" unless ( substr( $out, -2, 2 ) eq ",\n" ); } $out =~ s#,\n$#\n#; $out .= "\t" x ( $t - 1 ); } $out .= "],\n"; } elsif ( ref($ref) eq 'SCALAR' ) { $out .= _dumpd($$ref); } elsif ( ref($ref) eq 'REF' ) { $out .= _dump( $t, $$ref, $depth + 1 ); } elsif ( ref($ref) ) { # unknown/unsupported ref $out .= "undef"; } else { # normal scalar $out .= _dumpd($ref); } return $out;}########################################################################sub _dumpd { # escape a scalar string my $v = shift; return 'undef' if ( !defined $v ); return "''" if ( $v eq '' ); return "$v" if ( $v eq '0' || $v !~ tr/0-9//c && $v !~ m#^0+# ); if ( $v !~ tr/ !-~//c ) { $v =~ s/(['\\])/\\$1/g; return "'$v'"; } $v =~ s#\\#\\\\#g; $v =~ s#"#\\"#g; $v =~ s#\r#\\r#g; $v =~ s#\n#\\n#g; $v =~ s#\t#\\t#g; $v =~ s#\$#\\\$#g; $v =~ s#([^!-~ ])#sprintf('\\x%02x',ord($1))#eg; return "\"$v\"";}################################################################################################################################################{ # package variables my $MIMEBASE64_TRYLOADING = 1;########################################################################=item B<encode_base64>Params: $data [, $eol]Return: $b64_encoded_dataThis function does Base64 encoding. If the binary MIME::Base64 moduleis available, it will use that; otherwise, it falls back to an internalperl version. The perl version carries the following copyright: Copyright 1995-1999 Gisle Aas <gisle@aas.no>NOTE: the $eol parameter will be inserted every 76 characters. This isused to format the data for output on a 80 character wide terminal.=cut sub encode_base64 { if ($MIMEBASE64_TRYLOADING) { eval "require MIME::Base64"; $MIMEBASE64_TRYLOADING = 0; } goto &MIME::Base64::encode_base64 if ($MIME::Base64::VERSION); my $res = ""; my $eol = $_[1]; $eol = "\n" unless defined $eol; pos( $_[0] ) = 0; while ( $_[0] =~ /(.{1,45})/gs ) { $res .= substr( pack( 'u', $1 ), 1 ); chop($res); } $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = ( 3 - length( $_[0] ) % 3 ) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; if ( length $eol ) { $res =~ s/(.{1,76})/$1$eol/g; } $res; }########################################################################=item B<decode_base64>Params: $dataReturn: $b64_decoded_dataA perl implementation of base64 decoding. The perl code for this functionwas actually taken from an older MIME::Base64 perl module, and bears the following copyright:Copyright 1995-1999 Gisle Aas <gisle@aas.no>=cut sub decode_base64 { if ($MIMEBASE64_TRYLOADING) { eval "require MIME::Base64"; $MIMEBASE64_TRYLOADING = 0; } goto &MIME::Base64::decode_base64 if ($MIME::Base64::VERSION); my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; # remove padding $str =~ tr|A-Za-z0-9+/| -_|; # convert to uuencoded format while ( $str =~ /(.{1,60})/gs ) { my $len = chr( 32 + length($1) * 3 / 4 ); # compute length byte $res .= unpack( "u", $len . $1 ); # uudecode } $res; }########################################################################} # end package variables########################################################################=item B<encode_uri_hex>Params: $dataReturn: $resultThis function encodes every character (except the / character) with normal URL hex encoding.=cutsub encode_uri_hex { # normal hex encoding my $str = shift; $str =~ s/([^\/])/sprintf("%%%02x",ord($1))/ge; return $str;}#########################################################################=item B<encode_uri_randomhex>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -