lw.pm
=pod

=head1 - Function: LW::cookie_read

Params: \%jar, \%hout
Return: $num_of_cookies_read

Read in cookies from an %hout hash (HTTP response), and put them in %jar.

=cut

sub cookie_read {
 my ($count,$jarref,$href)=(0,@_);
 return 0 if(!(defined $jarref && ref($jarref)));
 return 0 if(!(defined $href && ref($href) ));
 my $target = utils_find_lowercase_key($href,'set-cookie');
 if(!defined $target){ return 0;}
 if(ref($target)){ # multiple headers
  foreach (@{$target}){
   cookie_parse($jarref,$_);
   $count++;
  }
 } else { # single header
  cookie_parse($jarref,$target);
  $count=1;
 }
 return $count;
}

########################################################################

=pod

=head1 - Function: LW::cookie_parse

Params: \%jar, $cookie
Return: nothing

Parses the cookie into the various parts and then sets the appropriate
values in the %jar under the name; if the cookie is blank, it will delete
it from the jar.

=cut

sub cookie_parse {
 my ($jarref, $header)=@_;
 my ($del,$part,@parts,@construct,$cookie_name)=(0);
 return if(!(defined $jarref && ref($jarref)));
 return if(!(defined $header && length($header)>0));

 # @construct holds: [0]=value [1]=domain [2]=path [3]=expires [4]=secure
 @parts=split(/;/,$header);
 foreach $part (@parts){
  if($part=~/^[ \t]*(.+?)=(.*)$/){
   my ($name,$val)=($1,$2);
   if($name=~/^domain$/i){
    $val=~s#^http://##;
    $val=~s#/.*$##;
    $construct[1]=$val;
   } elsif($name=~/^path$/i){
    $val=~s#/$## if($val ne '/');
    $construct[2]=$val;
   } elsif($name=~/^expires$/i){
    $construct[3]=$val;
   } else {
    $cookie_name=$name;
    if($val eq ''){ $del=1; }
    else { $construct[0]=$val; }
   }
  } else {
   if($part=~/secure/){ $construct[4]=1; }
  }
 }

 if($del){
  delete $$jarref{$cookie_name} if defined $$jarref{$cookie_name};
 } else {
  $$jarref{$cookie_name}=\@construct;
 }
}

########################################################################
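=pod

=head1 - Example: reading response cookies into a jar

A minimal, illustrative sketch (not part of the library). It assumes %hout
is a response hash that was filled in by a prior libwhisker request (e.g.
an LW::http_do_request(\%hin,\%hout) call) and that the response carried
one or more Set-Cookie headers; the jar is just a plain hash owned by the
caller, and 'session' is a made-up cookie name.

 my %jar;
 my $count = LW::cookie_read(\%jar,\%hout); # parse any Set-Cookie headers
 print "server set $count cookie(s)\n";

 # components are stored as (value, domain, path, expires, secure);
 # LW::cookie_get() is documented below
 my @session = LW::cookie_get(\%jar,'session');

=cut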
=pod

=head1 - Function: LW::cookie_write

Params: \%jar, \%hin, $override
Return: nothing

Goes through the given jar and sets the Cookie header in %hin for every
cookie whose domain and path match the request. If $override is true,
the domain and path restrictions of the cookies are ignored.

Todo: factor in expire and secure.

=cut

sub cookie_write {
 my ($jarref, $hin, $override)=@_;
 my ($name,$out)=('','');
 return if(!(defined $jarref && ref($jarref)));
 return if(!(defined $hin && ref($hin) ));
 $override=$override||0;
 $$hin{'whisker'}->{'ssl'}=$$hin{'whisker'}->{'ssl'}||0;

 foreach $name (keys %$jarref){
  next if($name eq '');
  # skip secure-only cookies on non-SSL requests
  next if($$hin{'whisker'}->{'ssl'}==0 && $$jarref{$name}->[4]>0);
  if($override ||
     ($$hin{'whisker'}->{'host'}=~/$$jarref{$name}->[1]$/i &&
      $$hin{'whisker'}->{'uri'}=~/$$jarref{$name}->[2]/i)){
   $out.="$name=$$jarref{$name}->[0];";
  }
 }

 if($out ne ''){ $$hin{'Cookie'}=$out; }
}

########################################################################

=pod

=head1 - Function: LW::cookie_get

Params: \%jar, $name
Return: @elements

Fetch the named cookie from the jar, and return the components.

=cut

sub cookie_get {
 my ($jarref,$name)=@_;
 return undef if(!(defined $jarref && ref($jarref)));
 if(defined $$jarref{$name}){ return @{$$jarref{$name}}; }
 return undef;
}

########################################################################

=pod

=head1 - Function: LW::cookie_set

Params: \%jar, $name, $value, $domain, $path, $expire, $secure
Return: nothing

Set the named cookie with the provided values into the %jar.

=cut

sub cookie_set {
 my ($jarref,$name,$value,$domain,$path,$expire,$secure)=@_;
 my @construct;
 return if(!(defined $jarref && ref($jarref)));
 return if($name eq '');
 if($value eq ''){ delete $$jarref{$name}; return; }
 $path=$path||'/';
 $secure=$secure||0;
 @construct=($value,$domain,$path,$expire,$secure);
 $$jarref{$name}=\@construct;
}

########################################################################
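=pod

=head1 - Example: seeding a jar and sending the Cookie header

A minimal, illustrative sketch (not part of the library). The host, URI
and cookie values are made up; it only shows the cookie_set/cookie_write
calling order against a request hash %hin of the usual whisker form.

 my (%jar, %hin);
 $hin{'whisker'}->{'host'}='www.example.com'; # hypothetical target
 $hin{'whisker'}->{'uri'} ='/app/index.cgi';
 $hin{'whisker'}->{'ssl'} =0;

 # name, value, domain, path, expire, secure
 LW::cookie_set(\%jar,'session','abc123','www.example.com','/app',undef,0);

 # copies matching cookies into $hin{'Cookie'}; pass 1 as the third
 # parameter to ignore the domain/path restrictions
 LW::cookie_write(\%jar,\%hin,0);

=cut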
=pod

=head1 ++ Sub package: crawl

Used for crawling a website by requesting a (start) page, reading the
HTML, extracting the links, and then requesting those links--up to a
specified depth. The module also allows various configuration tweaks to
do such things as monitor requests for offsite URLs (pages on other
hosts), track various cookies, etc.

=cut

#####################################################

=pod

=head1 - Function: LW::crawl

Params: $START, $MAX_DEPTH, \%tracking, \%hin
Return: Nothing

The heart of the crawl package. Will perform an HTTP crawl on the
specified HOST, starting at START URI, proceeding up to MAX_DEPTH. A
tracking hash reference (required) stores the results of each page (and
ongoing progress). The http_in_options hash reference specifies a
standard HTTP hash for use in the outgoing HTTP requests. Certain options
are configurable via LW::crawl_set_config(). The tracking hash will
contain all the pages visited; you can get the crawl engine to skip pages
by placing them in the tracking hash ahead of time.

START (first) parameter should be of the form "http://www.host.com/url".

=cut

sub crawl {
 my ($START, $MAX_DEPTH, $hrtrack, $hrin)=@_;
 my (%hout, %jar);
 my ($T, @ST, @links, @tlinks, @vals, @ERRORS)=('');

 return if(!(defined $hrtrack && ref($hrtrack)));
 return if(!(defined $hrin && ref($hrin) ));
 return if(!defined $START || length($START)==0);
 $MAX_DEPTH||=2;

 # $ST[0]=HOST  $ST[1]=URL  $ST[2]=CWD  $ST[3]=HTTPS  $ST[4]=SERVER
 # $ST[5]=PORT  $ST[6]=DEPTH
 @vals=utils_split_uri($START);
 $ST[1]=$vals[0]; # uri
 $ST[0]=$vals[2]; # host
 $ST[5]=$vals[3]; # port
 $ST[4]=undef;    # server tag
 return if($ST[0] eq '');

 # some various informationz...
 $LW::crawl_config{'host'}=$ST[0];
 $LW::crawl_config{'port'}=$ST[5];
 $LW::crawl_config{'start'}=$ST[1];

 $$hrin{'whisker'}->{'host'}=$ST[0];
 $$hrin{'whisker'}->{'port'}=$ST[5];
 $$hrin{'whisker'}->{'lowercase_incoming_headers'}=1; # makes life easier

 http_fixup_request($hrin);

 # this is so callbacks can access internals via references
 $LW::crawl_config{'ref_links'}=\@links;
 $LW::crawl_config{'ref_jar'}=\%jar;
 $LW::crawl_config{'ref_hin'}=$hrin;
 $LW::crawl_config{'ref_hout'}=\%hout;

 %LW::crawl_referrers=(); # empty out existing referrers
 %LW::crawl_server_tags=();
 %LW::crawl_offsites=();
 %LW::crawl_cookies=();
 %LW::crawl_forms=();

 push @links, \@{[$ST[1],1,($vals[1] eq 'https')?1:0]};

 while(@links){
  my $C=shift @links;
  $ST[1]=$C->[0]; # url
  $ST[6]=$C->[1]; # depth
  $ST[3]=$C->[2]; # https

  next if(defined $$hrtrack{$ST[1]} && $$hrtrack{$ST[1]} ne '?');

  if($ST[6] > $MAX_DEPTH){
   $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0);
   next;
  }

  $ST[2]=utils_get_dir($ST[1]);
  $$hrin{'whisker'}->{'uri'}=$ST[1];
  $$hrin{'whisker'}->{'ssl'}=$ST[3];

  my $result = crawl_do_request($hrin,\%hout);

  if($result==1 || $result==2){
   push @ERRORS, "Error on making request for '$ST[1]': $hout{'whisker'}->{'error'}";
   next;
  }

  if($result==0 || $result==4){
   $$hrtrack{$ST[1]}=$hout{'whisker'}->{'http_resp'};
  }
  if($result==3 || $result==5){
   $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0);
  }

  if(defined $hout{'server'}){
   if(!defined $ST[4]){ # server tag
    $ST[4]=$hout{'server'};
   }
   $LW::crawl_server_tags{$hout{'server'}}++;
  }

  if(defined $hout{'set-cookie'}){
   if($LW::crawl_config{'save_cookies'}>0){
    if(ref($hout{'set-cookie'})){
     foreach (@{$hout{'set-cookie'}}){
      $LW::crawl_cookies{$_}++;
     }
    } else {
     $LW::crawl_cookies{$hout{'set-cookie'}}++;
    }
   }
   if($LW::crawl_config{'reuse_cookies'}>0){
    cookie_read(\%jar,\%hout);
   }
  }

  next if($result==4 || $result==5);
  next if(scalar @links > $LW::crawl_config{'url_limit'});

  if($result==0){ # page should be parsed
   if($LW::crawl_config{'source_callback'} != 0 &&
      ref($LW::crawl_config{'source_callback'})){
    &{$LW::crawl_config{'source_callback'}}($hrin,\%hout);
   }
   LW::html_find_tags(\$hout{'whisker'}->{'data'},
      \&crawl_extract_links_test);
   $LW::crawl_config{'stats_html'}++; # count how many pages we've parsed
  }

  if($result==3){ # follow the move via location header
   push @LW::crawl_urls, $hout{'location'};
  }

  foreach $T (@LW::crawl_urls){
   $T=~tr/\0\r\n//d; # the NULL character is a bug that's somewhere
   next if (length($T)==0);
   next if ($T=~/^javascript:/i); # stupid javascript
   next if ($T=~/^mailto:/i);
   next if ($T=~m#^([a-zA-Z]*)://# && lc($1) ne 'http' && lc($1) ne 'https');
   next if ($T=~/^#/i); # fragment

   if($LW::crawl_config{'callback'} != 0){
    next if &{$LW::crawl_config{'callback'}}($T,@ST);
   }

   push(@{$LW::crawl_referrers{$T}}, $ST[1])
      if( $LW::crawl_config{'save_referrers'}>0 );

   $T=utils_absolute_uri($T,$ST[1],1) if($LW::crawl_config{'normalize_uri'}>0);
   @vals=utils_split_uri($T);

   # slashdot bug: workaround for the following fsck'd html code:
   # <FORM ACTION="//slashdot.org/users.pl" METHOD="GET">
   if($LW::crawl_config{'slashdot_bug'} > 0 && substr($vals[0],0,2) eq '//'){
    if($ST[3]==1){ $T='https:'.$T; }
    else {         $T='http:' .$T; }
    @vals=utils_split_uri($T);
   }

   # make sure URL is on same host, port, and protocol
   if( (defined $vals[2] && $vals[2] ne $ST[0]) ||
       (defined $vals[3] && $vals[3] != $ST[5]) ||
       (defined $vals[1] && ($vals[1] ne 'http' && $vals[1] ne 'https'))){
    if($LW::crawl_config{'save_offsites'}>0){
     $LW::crawl_offsites{utils_join_uri(@vals)}++;
    }
    next;
   }

   if(substr($vals[0],0,1) ne '/'){
    $vals[0]=$ST[2].$vals[0];
   }

   my $where=rindex($vals[0],'.');
   my $EXT='';
   if($where >= 0){
    $EXT = substr($vals[0], $where+1, length($vals[0])-$where);
   }
   $EXT=~tr/0-9a-zA-Z//cd; # yucky chars will puke regex below

   if($EXT ne '' && $LW::crawl_config{'skip_ext'}=~/\.$EXT /i){
    if($LW::crawl_config{'save_skipped'}>0){
     $$hrtrack{$vals[0]}='?';
    }
    next;
   }

   if(defined $vals[4] && $LW::crawl_config{'use_params'}>0){
    if($LW::crawl_config{'params_double_record'}>0 &&
       !defined $$hrtrack{$vals[0]}){
     $$hrtrack{$vals[0]}='?';
    }
    $vals[0]=$vals[0].'?'.$vals[4];
   }

   next if(defined $$hrtrack{$vals[0]});
   push @links, \@{[$vals[0],$ST[6]+1, ($vals[1] eq 'https')?1:0]};

  } # foreach

  @LW::crawl_urls=(); # reset for next round

 } # while

 my $key;
 foreach $key (keys %LW::crawl_config){
  delete $LW::crawl_config{$key} if (substr($key,0,4) eq 'ref_');
 }

 $LW::crawl_config{'stats_reqs'}=$hout{'whisker'}->{'stats_reqs'};
 $LW::crawl_config{'stats_syns'}=$hout{'whisker'}->{'stats_syns'};

} # end sub crawl

#####################################################
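=pod

=head1 - Example: crawling a site

A minimal, illustrative sketch (not part of the library). The host, depth
and directives are made up, and an empty request hash %hin is assumed to
be acceptable here since crawl() fills in the host, port and URI itself;
configuration directives are described under LW::crawl_set_config() below.

 my (%track, %hin);

 # optional tweaks before the crawl
 LW::crawl_set_config('skip_ext', ".gif .jpg "); # ignore images
 LW::crawl_set_config('save_skipped', 1);

 # pre-seeding an entry makes the engine skip that page
 # ('/logout.cgi' is a made-up URI)
 $track{'/logout.cgi'}='skip';

 # crawl http://www.example.com/ down to a depth of 3
 LW::crawl('http://www.example.com/', 3, \%track, \%hin);

 # %track now maps each visited URI to its HTTP response code
 # ('?' marks skipped entries)
 foreach my $uri (sort keys %track){
  print "$uri => $track{$uri}\n";
 }

=cut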
=pod

=head1 - Function: LW::crawl_get_config

Params: $config_directive
Return: $config_directive_value

Returns the set value of the submitted config_directive. See
LW::crawl_set_config() for a list of configuration values.

=cut

sub crawl_get_config {
 my $key=shift;
 return $LW::crawl_config{$key};
}

#####################################################

=pod

=head1 - Function: LW::crawl_set_config

Params: $config_directive, $value
Return: Nothing

This function adjusts the configuration of the crawl package. Use values
of 0 and 1 for off and on, respectively. The defaults are set in
libs/globals.wpl.

save_cookies - crawl will save all cookies encountered, for later review

save_offsite_urls - crawl will save all offsite URLs (URLs not on this
host); crawl will not actually crawl those hosts (use separate calls to
crawl)

follow_moves - crawl will follow the URL received from an HTTP move
response

use_params - crawl will factor in URI parameters when considering if a
URI is unique or not

params_double_record - if both use_params and params_double_record are
set, crawl will make two entries for each URI which has parameters: one
with and one without the parameters

reuse_cookies - crawl will resubmit any received/prior cookies

skip_ext - crawl will ignore requests for URLs ending in the extensions
given; the value requires a specific string format: (dot)extension(space).
For example, to ignore GIFs and JPGs, you would run:
LW::crawl_set_config('skip_ext',".gif .jpg ");

save_skipped - any URLs that are skipped via skip_ext, or are above the
specified DEPTH, will be recorded in the tracking hash with a value of
'?' (instead of an