📄 lw2.pm
字号:
if ( scalar %$hr ) { while ( my ( $key, $val ) = each %$hr ) { if ( $key =~ tr/A-Z// ) { delete $$hr{$key}; if ( defined $val ) { $$hr{ lc($key) } = $val; } else { $$hr{ lc($key) } = undef; } } } } if ( $TAG eq 'form' ) { if ( scalar %$_forms_CURRENT ) { # save last form push( @$_forms_FOUND, $_forms_CURRENT ); $ar->[0] = {}; $_forms_CURRENT = $ar->[0]; } $_forms_CURRENT->{"\0"} = [ $$hr{name}, $$hr{method}, $$hr{action}, [] ]; delete $$hr{'name'}; delete $$hr{'method'}; delete $$hr{'action'}; $key = "\0"; $UNKNOWNS = 0; } elsif ( $TAG eq 'input' ) { $$hr{type} = 'text' if ( !defined $$hr{type} ); $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $$hr{value} = undef if ( !defined $$hr{value} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'input-' . $$hr{type}, $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'type'}; delete $$hr{'value'}; } elsif ( $TAG eq 'select' ) { $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'select', undef, [] ]; $CURRENT_SELECT = $key; delete $$hr{name}; } elsif ( $TAG eq '/select' ) { push @{ $_forms_CURRENT->{$CURRENT_SELECT} }, [ '/select', undef, [] ]; $CURRENT_SELECT = undef; return undef; } elsif ( $TAG eq 'option' ) { return undef if ( !defined $CURRENT_SELECT ); if ( !defined $$hr{value} ) { my $stop = index( $$dr, '<', $start + $len ); return undef if ( $stop == -1 ); # MAJOR PUKE $$hr{value} = substr( $$dr, $start + $len, ( $stop - $start - $len ) ); $$hr{value} =~ tr/\r\n//d; } push @{ $_forms_CURRENT->{$CURRENT_SELECT} }, [ 'option', $$hr{value}, [] ]; delete $$hr{value}; } elsif ( $TAG eq 'textarea' ) { my $stop = $start + $len; $$hr{value} = $$hr{'='}; delete $$hr{'='}; $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'textarea', $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'value'}; } else { # button $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $$hr{value} = undef if ( !defined $$hr{value} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'button', $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'value'}; } if ( scalar %$hr ) { if ( $TAG eq 'form' ) { $parr = $_forms_CURRENT->{$key}->[3]; } else { $parr = $_forms_CURRENT->{$key}->[-1]; $parr = $parr->[2]; } my ( $k, $v ); while ( ( $k, $v ) = each(%$hr) ) { if ( defined $v ) { push @$parr, "$k=\"$v\""; } else { push @$parr, $k; } } } return undef; }}################################################################=item B<html_find_tags>Params: \$data, \&callback_function [, $xml_flag, $funcref, \%tag_map]Return: nothinghtml_find_tags parses a piece of HTML and 'extracts' all found tags,passing the info to the given callback function. The callback function must accept two parameters: the current tag (as a scalar), and a hash ref of all the tag's elements. For example, the tag <a href="/file"> willpass 'a' as the current tag, and a hash reference which contains{'href'=>"/file"}.The xml_flag, when set, causes the parser to do some extra processingand checks to accomodate XML style tags such as <tag foo="bar"/>.The optional %tagmap is a hash of lowercase tag names. If a tagmap issupplied, then the parser will only call the callback function if thetag name exists in the tagmap.The optional $funcref variable is passed straight to the callbackfunction, allowing you to pass flags or references to more complexstructures to your callback function.=cut{ # contained variables $DR = undef; # data reference $c = 0; # parser pointer $LEN = 0; sub html_find_tags { my ( $dataref, $callbackfunc, $xml, $fref, $tagmap ) = @_; return if ( !( defined $dataref && ref($dataref) ) ); return if ( !( defined $callbackfunc && ref($callbackfunc) ) ); $xml ||= 0; my ( $INTAG, $CURTAG, $LCCURTAG, $ELEMENT, $VALUE, $cc ) = (0); my ( %TAG, $ret, $start, $tagstart, $tempstart, $x, $found ); my $usetagmap = ( ( defined $tagmap && ref($tagmap) ) ? 1 : 0 ); $CURTAG = $LCCURTAG = $ELEMENT = $VALUE = $cc = ''; $DR = $dataref; $LEN = length($$dataref); for ( $c = 0 ; $c < $LEN ; $c++ ) { $cc = substr( $$dataref, $c, 1 ); next if ( !$INTAG && $cc ne '>' && $cc ne '<' ); if ( $cc eq '<' ) { if ($INTAG) { # we're already in a tag... # we trick the parser into thinking we end cur tag $cc = '>'; $c--; } elsif ($xml && $LEN > ( $c + 9 ) && substr( $$dataref, $c + 1, 8 ) eq '![CDATA[' ) { $c += 9; $tempstart = $c; $found = index( $$dataref, ']]>', $c ); $c = $found + 2; $c = $LEN if ( $found < 0 ); # malformed XML # what to do with CDATA? next; } elsif ( $LEN > ( $c + 3 ) && substr( $$dataref, $c + 1, 3 ) eq '!--' ) { $tempstart = $c; $c += 4; $found = index( $$dataref, '-->', $c ); if ( $found < 0 ) { $found = index( $$dataref, '>', $c ); $found = $LEN if ( $found < 0 ); $c = $found; } else { $c = $found + 2; } if ( $usetagmap == 0 || defined $tagmap->{'!--'} ) { my $dat = substr( $$dataref, $tempstart + 4, $found - $tempstart - 4 ); &$callbackfunc( '!--', { '=' => $dat }, $dataref, $tempstart, $c - $tempstart + 1, $fref ); } next; } elsif ( !$INTAG ) { next if ( substr( $$dataref, $c + 1, 1 ) =~ tr/ \t\r\n// ); $c++; $INTAG = 1; $tagstart = $c - 1; $CURTAG = ''; while ( $c < $LEN && ( $x = substr( $$dataref, $c, 1 ) ) !~ tr/ \t\r\n>=// ) { $CURTAG .= $x; $c++; } chop $CURTAG if ( $xml && substr( $CURTAG, -1, 1 ) eq '/' ); $c++ if ( defined $x && $x ne '>' ); $LCCURTAG = lc($CURTAG); $INTAG = 0 if ( $LCCURTAG !~ tr/a-z0-9// ); next if ( $c >= $LEN ); $cc = substr( $$dataref, $c, 1 ); } } if ( $cc eq '>' ) { next if ( !$INTAG ); if ( $LCCURTAG eq 'script' && !$xml ) { $tempstart = $c + 1; pos($$dataref) = $c; if ( $$dataref !~ m#(</script.*?>)#ig ) { # what to do if closing script not found? # right now, we'll just leave the tag alone; # this won't affect the 'absorption' of the # javascript code (and thus, affect parsing) } else { $c = pos($$dataref) - 1; my $l = length($1); $TAG{'='} = substr( $$dataref, $tempstart, $c - $tempstart - $l + 1 ); } } elsif ( $LCCURTAG eq 'textarea' && !$xml ) { $tempstart = $c + 1; pos($$dataref) = $c; if ( $$dataref !~ m#(</textarea.*?>)#ig ) { # no closing textarea... } else { $c = pos($$dataref) - 1; my $l = length($1); $TAG{'='} = substr( $$dataref, $tempstart, $c - $tempstart - $l + 1 ); } } $INTAG = 0; $TAG{'/'}++ if ( $xml && substr( $$dataref, $c - 1, 1 ) eq '/' ); &$callbackfunc( $CURTAG, \%TAG, $dataref, $tagstart, $c - $tagstart + 1, $fref ) if ( $usetagmap == 0 || defined $tagmap->{$LCCURTAG} ); $CURTAG = $LCCURTAG = ''; %TAG = (); next; } if ($INTAG) { $ELEMENT = ''; $VALUE = undef; # eat whitespace pos($$dataref) = $c; if ( $$dataref !~ m/[^ \t\r\n]/g ) { $c = $LEN; next; # should we really abort? } $start = pos($$dataref) - 1; if ( $$dataref !~ m/[ \t\r\n<>=]/g ) { $c = $LEN; next; # should we really abort? } $c = pos($$dataref) - 1; if ( $c > $start ) { $ELEMENT = substr( $$dataref, $start, $c - $start ); chop $ELEMENT if ( $xml && substr( $ELEMENT, -1, 1 ) eq '/' ); } $cc = substr( $$dataref, $c, 1 ); if ( $cc ne '>' ) { # eat whitespace if ( $cc =~ tr/ \t\r\n// ) { $c++ while ( substr( $$dataref, $c, 1 ) =~ tr/ \t\r\n// ); } if ( substr( $$dataref, $c, 1 ) eq '=' ) { $c++; $start = $c; my $p = substr( $$dataref, $c, 1 ); if ( $p eq '"' || $p eq '\'' ) { $c++; $start++; $c = index( $$dataref, $p, $c ); if ( $c < 0 ) { $c = $LEN; next; } # Bad HTML $VALUE = substr( $$dataref, $start, $c - $start ); $c++; pos($$dataref) = $c; } else { pos($$dataref) = $c; if ( $$dataref !~ /[ \t\r\n>]/g ) { $c = $LEN; } else { $c = pos($$dataref) - 1; $VALUE = substr( $$dataref, $start, $c - $start ); chop $VALUE if ( $xml && substr( $$dataref, $c - 1, 2 ) eq '/>' ); } } if ( substr( $$dataref, $c, 1 ) =~ tr/ \t\r\n// ) { if ( $$dataref !~ /[^ \t\r\n]/g ) { $c = $LEN; next; # should we really abort? } $c = pos($$dataref) - 1; } } } # if $c ne '>' $c--; $TAG{$ELEMENT} = $VALUE if ( $ELEMENT ne '' || ( $xml && $ELEMENT ne '/' ) ); } } # finish off any tags we had going if ($INTAG) { &$callbackfunc( $CURTAG, \%TAG, $dataref, $tagstart, $c - $tagstart + 1, $fref ) if ( $usetagmap == 0 || defined $tagmap->{$LCCURTAG} ); } $DR = undef; # void dataref pointer }################################################################=item B<html_find_tags_rewrite>Params: $position, $length, $replacementReturn: nothinghtml_find_tags_rewrite() is used to 'rewrite' an HTML stream fromwithin an html_find_tags() callback function. In general, you canthink of html_find_tags_rewrite working as:substr(DATA, $position, $length) = $replacementWhere DATA is the current HTML string the html parser is using.The reason you need to use this function and not substr() isbecause a few internal parser pointers and counters need to beadjusted to accomodate the changes.If you want to remove a piece of the string, just set thereplacement to an empty string (''). If you wish to insert astring instead of overwrite, just set $length to 0; your stringwill be inserted at the indicated $position.=cut sub html_find_tags_rewrite { return if ( !defined $DR ); my ( $pos, $len, $replace_str ) = @_; # replace the data substr( $$DR, $pos, $len ) = $replace_str; # adjust pointer and length my $l = ( length($replace_str) - $len ); $c += $l; $LEN += $l; }################################################################ sub _html_find_tags_adjust { my ( $p, $l ) = @_; $c += $p; $LEN += $l; }} # end container################################################################=item B<html_link_extractor>Params: \$html_dataReturn: @urlsThe html_link_extractor() function uses the internal crawl tests toextract all the HTML links from the given HTML data stream.Note: html_link_extractor() does not unique the returned array ofdiscovered links, nor does i
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -