# lw2.pm

=item B<cookie_write>

Params: $jar, \%request, $override
Return: nothing

Goes through the given $jar and sets the Cookie header in %req, subject to
the correct domain and path. If $override is true, then the secure, domain
and path restrictions of the cookies are ignored and all cookies are
essentially included.

Notice: cookie expiration is currently not implemented. URL restriction
comparison is also case-insensitive.

=cut

sub cookie_write {
    my ( $jarref, $hin, $override ) = @_;
    my ( $name, $out ) = ( '', '' );

    return if ( !( defined $jarref && ref($jarref) ) );
    return if ( !( defined $hin    && ref($hin) ) );
    $override = $override || 0;
    $$hin{'whisker'}->{'ssl'} = $$hin{'whisker'}->{'ssl'} || 0;

    foreach $name ( keys %$jarref ) {
        next if ( $name eq '' );
        if ($override) {
            $out .= "$name=$$jarref{$name}->[0];";
            next;
        }
        next if ( $$hin{'whisker'}->{'ssl'} == 0 && $$jarref{$name}->[4] > 0 );
        if (   $$hin{'whisker'}->{'host'} =~ /$$jarref{$name}->[1]$/i
            && $$hin{'whisker'}->{'uri'} =~ /^$$jarref{$name}->[2]/ )
        {
            $out .= "$name=$$jarref{$name}->[0];";
        }
    }

    if ( $out ne '' ) { $$hin{'Cookie'} = $out; }
}

########################################################################

=item B<cookie_get>

Params: $jar, $name
Return: @elements

Fetch the named cookie from the $jar, and return the components. The
returned items will be an array in the following order:

 value, domain, path, expire, secure

 value  = cookie value, should always be a non-empty string
 domain = domain root for cookie, can be undefined
 path   = URL path for cookie, should always be a non-empty string
 expire = undefined (deprecated, but exists for backwards-compatibility)
 secure = whether or not the cookie is limited to HTTPS; value is 0 or 1

=cut

sub cookie_get {
    my ( $jarref, $name ) = @_;
    return undef if ( !( defined $jarref && ref($jarref) ) );
    if ( defined $$jarref{$name} ) {
        return @{ $$jarref{$name} };
    }
    return undef;
}

########################################################################

=item B<cookie_get_names>

Params: $jar
Return: @names

Fetch all the cookie names from the jar, which then let you cookie_get()
them individually.

=cut

sub cookie_get_names {
    my ( $jarref, $name ) = @_;
    return undef if ( !( defined $jarref && ref($jarref) ) );
    return keys %$jarref;
}

########################################################################

=item B<cookie_get_valid_names>

Params: $jar, $domain, $url, $ssl
Return: @names

Fetch all the cookie names from the jar which are valid for the given
$domain, $url, and $ssl values. $domain should be a string scalar of the
target host domain ('www.example.com', etc.). $url should be the absolute
URL for the page ('/index.html', '/cgi-bin/foo.cgi', etc.). $ssl should be
0 for non-secure cookies, or 1 for all (secure and normal) cookies. The
return value is an array of names compatible with cookie_get().

=cut

sub cookie_get_valid_names {
    my ( $jarref, $domain, $url, $ssl ) = @_;

    return () if ( !( defined $jarref && ref($jarref) ) );
    return () if ( !defined $domain || $domain eq '' );
    return () if ( !defined $url    || $url eq '' );
    $ssl ||= 0;

    my ( @r, $name );
    foreach $name ( keys %$jarref ) {
        next if ( $name eq '' );
        next if ( $$jarref{$name}->[4] > 0 && $ssl == 0 );
        if (   $domain =~ /$$jarref{$name}->[1]$/i
            && $url =~ /^$$jarref{$name}->[2]/i )
        {
            push @r, $name;
        }
    }
    return @r;
}

########################################################################
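=pod

A rough usage sketch of the query functions (the host, URL and jar contents
are illustrative; the jar is assumed to have been filled elsewhere, for
example by cookie_read() on a prior response, or by cookie_set(), described
below):

 my %jar;    # assumed to be populated elsewhere
 my @names = cookie_get_valid_names( \%jar, 'www.example.com',
                                     '/cgi-bin/foo.cgi', 0 );
 foreach my $name (@names) {
     my ( $value, $domain, $path, $expire, $secure ) =
       cookie_get( \%jar, $name );
     print "$name=$value path=$path secure=$secure\n";
 }

=cut
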
=item B<cookie_set>

Params: $jar, $name, $value, $domain, $path, $expire, $secure
Return: nothing

Set the named cookie with the provided values into the $jar. $name is
required to be a non-empty string. $value is required, and will delete
the named cookie from the $jar if it is an empty string. $domain and
$path can be strings or undefined. $expire is ignored (but exists for
backwards-compatibility). $secure should be the numeric value of 0 or 1.

=cut

sub cookie_set {
    my ( $jarref, $name, $value, $domain, $path, $expire, $secure ) = @_;
    my @construct;

    return if ( !( defined $jarref && ref($jarref) ) );
    return if ( $name eq '' );

    if ( !defined $value || $value eq '' ) {
        delete $$jarref{$name};
        return;
    }

    $path   = $path   || '/';
    $secure = $secure || 0;

    @construct = ( $value, $domain, $path, undef, $secure );
    $$jarref{$name} = \@construct;
}

########################################################################
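=pod

A short sketch of the write side: cookie_set() builds jar entries by hand,
and cookie_write() then attaches the matching ones to a request hash (the
host, URI and cookie values here are illustrative):

 my %jar;
 cookie_set( \%jar, 'session', 'abc123', '.example.com', '/', undef, 0 );

 my %request;
 $request{whisker}->{host} = 'www.example.com';
 $request{whisker}->{uri}  = '/index.html';
 $request{whisker}->{ssl}  = 0;

 cookie_write( \%jar, \%request );    # sets $request{Cookie} if any match
 cookie_set( \%jar, 'session', '' );  # an empty value deletes the cookie

=cut
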
########################################################################

# cluster global variables
%_crawl_config = (
    'save_cookies'         => 0,
    'reuse_cookies'        => 1,
    'save_offsites'        => 0,
    'save_non_http'        => 0,
    'follow_moves'         => 1,
    'url_limit'            => 1000,
    'use_params'           => 0,
    'params_double_record' => 0,
    'skip_ext'             => {
        gif => 1, jpg => 1, png => 1, gz  => 1, swf => 1, pdf => 1,
        zip => 1, wav => 1, mp3 => 1, asf => 1, tgz => 1
    },
    'save_skipped'         => 0,
    'save_referrers'       => 0,
    'use_referrers'        => 1,
    'do_head'              => 0,
    'callback'             => 0,
    'netloc_bug'           => 1,
    'normalize_uri'        => 1,
    'source_callback'      => 0
);

%_crawl_linktags = (
    'a'          => 'href',
    'applet'     => [qw(codebase archive code)],
    'area'       => 'href',
    'base'       => 'href',
    'bgsound'    => 'src',
    'blockquote' => 'cite',
    'body'       => 'background',
    'del'        => 'cite',
    'embed'      => [qw(src pluginspage)],
    'form'       => 'action',
    'frame'      => [qw(src longdesc)],
    'iframe'     => [qw(src longdesc)],
    'ilayer'     => 'background',
    'img'        => [qw(src lowsrc longdesc usemap)],
    'input'      => [qw(src usemap)],
    'ins'        => 'cite',
    'isindex'    => 'action',
    'head'       => 'profile',
    'layer'      => [qw(background src)],
    'link'       => 'href',
    # 'meta'     => 'http-equiv',
    'object'     => [qw(codebase data archive usemap)],
    'q'          => 'cite',
    'script'     => 'src',
    'table'      => 'background',
    'td'         => 'background',
    'th'         => 'background',
    'xmp'        => 'href',
);

#####################################################

=item B<crawl_new>

Params: $START, $MAX_DEPTH, \%request_hash [, \%tracking_hash ]
Return: $crawl_object

The crawl_new() function initializes a crawl object (hash) to the default
values, and then returns it for later use by crawl(). $START is the
starting URL (in the form of 'http://www.host.com/url'), and MAX_DEPTH is
the maximum number of levels to crawl (the START URL counts as 1, so a
value of 2 will crawl the START URL and all URLs found on that page). The
request_hash is a standard initialized request hash to be used for
requests; you should set any authentication information or headers in this
hash in order for the crawler to use them. The optional tracking_hash lets
you supply a hash for use in tracking URL results (otherwise crawl_new()
will allocate a new anon hash).

=cut

sub crawl_new {
    my ( $start, $depth, $reqref, $trackref ) = @_;
    my %X;

    return undef if ( !defined $start  || !defined $depth );
    return undef if ( !defined $reqref || !ref($reqref) );

    $trackref = {} if ( !defined $trackref || !ref($trackref) );

    $X{track}   = $trackref;
    $X{request} = $reqref;
    $X{depth}   = $depth || 2;
    $X{start}   = $start;
    $X{magic}   = 7340;

    $X{reset} = sub {
        $X{errors}      = [];    # all errors encountered
        $X{urls}        = [];    # temp; used to hold all URLs on page
        $X{server_tags} = {};    # all server tags found
        $X{referrers}   = {};    # who refers to what URLs
        $X{offsites}    = {};    # all URLs that point offsite
        $X{response}    = {};    # temp; the response hash
        $X{non_http}    = {};    # all non_http URLs found
        $X{cookies}     = {};    # all cookies found
        $X{forms}       = {};    # all forms found
        $X{jar}         = {};    # temp; cookie jar
        $X{url_queue}   = [];    # temp; URLs to still fetch
        $X{config}      = {};
        %{ $X{config} } = %_crawl_config;
        %{ $X{track} }  = ();
        $X{parsed_page_count} = 0;
    };

    $X{crawl} = sub { crawl( \%X, @_ ) };

    $X{reset}->();
    return \%X;
}

#####################################################

=item B<crawl>

Params: $crawl_object [, $START, $MAX_DEPTH ]
Return: $count [ undef on error ]

The heart of the crawl package. Will perform an HTTP crawl on the
specified HOST, starting at START URI, proceeding up to MAX_DEPTH.
Crawl_object needs to be the variable returned by crawl_new(). You can
also indirectly call crawl() via the crawl_object itself:

 $crawl_object->{crawl}->($START,$MAX_DEPTH)

Returns the number of URLs actually crawled (not including those skipped).

=cut
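=pod

A minimal sketch of driving the crawler with crawl_new() and crawl() (the
target URL is illustrative, and the request hash would normally be prepared
and given any authentication headers first, as described under crawl_new()
above):

 my %request;    # standard initialized request hash, per crawl_new() above
 my $crawler = crawl_new( 'http://www.example.com/', 2, \%request );

 # adjust the per-crawl copy of %_crawl_config before starting
 $crawler->{config}->{save_cookies} = 1;
 $crawler->{config}->{url_limit}    = 200;

 my $count = $crawler->{crawl}->();    # equivalent to crawl($crawler)
 if ( !defined $count ) {
     print "error: $_\n" foreach @{ $crawler->{errors} };
 }
 else {
     print "crawled $count URLs\n";
     print "$_ => $crawler->{track}->{$_}\n"
       foreach ( sort keys %{ $crawler->{track} } );
 }

=cut
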
{    # START OF CRAWL CONTAINER

    sub crawl {
        my ( $C, $START, $MAX_DEPTH ) = @_;
        return undef if ( !defined $C || !ref($C) || $C->{magic} != 7340 );

        # shortcuts, to reduce dereferences and typing
        my $CONFIG = $C->{config};
        my $TRACK  = $C->{track};
        my $URLS   = $C->{urls};
        my $RESP   = $C->{response};
        my $REQ    = $C->{request};
        my $Q      = $C->{url_queue};

        $START ||= $C->{start};
        $C->{depth} = $MAX_DEPTH || $C->{depth};

        my ( $COUNT, $T, @ST ) = ( 0, '' );

        # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD, 5.REF ]

        my @v = uri_split($START);

        my $error = undef;
        $error = 'Start protocol not http or https'
          if ( $v[1] ne 'http' && $v[1] ne 'https' );
        $error = 'Bad start host' if ( !defined $v[2] || $v[2] eq '' );
        push( @{ $C->{errors} }, $error ) && return undef if ( defined $error );

        @ST = ( $v[2], $v[3], $v[0], 1, '', '' );

        $REQ->{whisker}->{ssl}  = 1 if ( $v[1] eq 'https' );
        $REQ->{whisker}->{host} = $ST[0];
        $REQ->{whisker}->{port} = $ST[1];
        $REQ->{whisker}->{lowercase_incoming_headers} = 1;
        $REQ->{whisker}->{ignore_duplicate_headers}   = 0;
        delete $REQ->{whisker}->{parameters};
        http_fixup_request($REQ);

        push @$Q, \@ST;

        while (@$Q) {
            @ST = @{ shift @$Q };

            next if ( defined $TRACK->{ $ST[2] } && $TRACK->{ $ST[2] } ne '?' );

            if ( $ST[3] > $C->{depth} ) {
                $TRACK->{ $ST[2] } = '?' if ( $CONFIG->{save_skipped} > 0 );
                next;
            }

            $ST[4] = uri_get_dir( $ST[2] );
            $REQ->{whisker}->{uri} = $ST[2];

            if ( $ST[5] ne '' && $CONFIG->{use_referrers} > 0 ) {
                $REQ->{Referrer} = $ST[5];
            }

            my $result = _crawl_do_request( $REQ, $RESP, $C );

            if ( $result == 1 || $result == 2 ) {
                push @{ $C->{errors} }, "$ST[2]: $RESP->{whisker}->{error}";
                next;
            }

            $COUNT++;

            $TRACK->{ $ST[2] } = $RESP->{whisker}->{code}
              if ( $result == 0 || $result == 4 );
            $TRACK->{ $ST[2] } = '?'
              if ( ( $result == 3 || $result == 5 )
                && $CONFIG->{save_skipped} > 0 );

            if ( defined $RESP->{server} && !ref( $RESP->{server} ) ) {
                $C->{server_tags}->{ $RESP->{server} }++;
            }

            if ( defined $RESP->{'set-cookie'} ) {
                if ( $CONFIG->{save_cookies} > 0 ) {
                    if ( ref( $RESP->{'set-cookie'} ) ) {
                        $C->{cookies}->{$_}++
                          foreach ( @{ $RESP->{'set-cookie'} } );
                    }
                    else {
                        $C->{cookies}->{ $RESP->{'set-cookie'} }++;
                    }
                }
                cookie_read( $C->{jar}, $RESP )
                  if ( $CONFIG->{reuse_cookies} > 0 );
            }

            next if ( $result == 4 || $result == 5 );
            next if ( scalar @$Q > $CONFIG->{url_limit} );

            if ( $result == 0 ) {    # page should be parsed
                if ( $CONFIG->{source_callback} != 0
                    && ref( $CONFIG->{source_callback} ) eq 'CODE' )
                {
                    &{ $CONFIG->{source_callback} }($C);
                }
                html_find_tags( \$RESP->{whisker}->{data},
                    \&_crawl_extract_links_test, 0, $C, \%_crawl_linktags );
                $C->{parsed_page_count}++;
            }

            push @$URLS, $RESP->{location} if ( $result == 3 );

            foreach $T (@$URLS) {