📄 htmlparser.py
字号:
if not match:
return -1
if report:
j = match.start()
self.handle_comment(rawdata[i+4: j])
j = match.end()
return j
# Internal - parse script (javascript etc) code, return end or -1 if not terminated
# This handler added abp 29-09-03
def parse_script_tag(self, i, report=1):
    """Consume a <script> element that starts at index i of self.rawdata.

    The end of the element is located with the module-level jscriptclose
    pattern, searching from just past the opening '<script'.  When report
    is true and the jscriptsrc pattern matches the text in between,
    handle_javascript('script', attrs) is called; attrs holds one
    (name, value) pair taken from the matched text, with single and
    double quote characters removed from the value.

    Returns the index just past the closing match, or -1 if no closing
    match is found (element not terminated yet).
    """
    rawdata = self.rawdata
    assert rawdata[i:i+7].lower() == '<script', 'unexpected call to parse_script_tag()'
    match = jscriptclose.search(rawdata, i+7)
    if not match:
        return -1
    if report:
        # Handle only javascript tags for the time being: only full
        # source links of the form src="http://foo.com/bar/script.js".
        m = jscriptsrc.search(rawdata[i+7:match.start()])
        if m:
            attrs = []
            try:
                # Split on the first '=' only, so a value that itself
                # contains '=' (e.g. script.js?v=1) is not discarded.
                tag, href = m.group().split('=', 1)
            except ValueError:
                # No '=' in the matched text: report the script with an
                # empty attribute list rather than failing.
                pass
            else:
                href = href.replace("'", "").replace('"', '')
                attrs.append((tag, href))
            self.handle_javascript('script', attrs)
    return match.end()
# Internal -- parse java applet tag, return end or -1 if not terminated
# This handler added abp 29-09-03
def parse_applet_tag(self, i, report=1):
    """Consume an <applet> element that starts at index i of self.rawdata.

    The end of the element is located with the module-level appletclose
    pattern, searching from just past the opening '<applet'.  When report
    is true, the text in between is searched with the appletcode and
    appletcodebase patterns; each match contributes one (name, value)
    pair (quotes removed from the value) and the collected list is passed
    to handle_java_applet_tag('applet', attrs).

    Returns the index just past the closing match, or -1 if no closing
    match is found (element not terminated yet).
    """
    rawdata = self.rawdata
    assert rawdata[i:i+7].lower() == '<applet', 'unexpected call to parse_applet_tag()'
    match = appletclose.search(rawdata, i+7)
    if not match:
        return -1
    if report:
        # Slice once; the searched text is the same for both patterns.
        body = rawdata[i+7:match.start()]
        attrs = []
        for rexps in (appletcode, appletcodebase):
            m = rexps.search(body)
            if not m:
                continue
            try:
                # Split on the first '=' only, so a value that itself
                # contains '=' is kept instead of being dropped.
                tag, href = m.group().split('=', 1)
            except ValueError:
                # No '=' in the matched text: skip this attribute.
                continue
            href = href.replace("'", "").replace('"', '')
            attrs.append((tag, href))
        self.handle_java_applet_tag('applet', attrs)
    return match.end()
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
    """Consume a processing instruction that starts at index i.

    The text between the leading '<?' and the start of the piclose match
    is passed to handle_pi().  Returns the index just past that match, or
    -1 when no match is found (instruction not terminated yet).
    """
    text = self.rawdata
    assert text[i:i+2] == '<?', 'unexpected call to parse_pi()'
    closing = piclose.search(text, i+2)  # >
    if closing is None:
        return -1
    self.handle_pi(text[i+2:closing.start()])
    return closing.end()
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
    """Parse the start tag beginning at index i of self.rawdata.

    check_for_whole_start_tag(i) supplies the end index; if it is
    negative it is returned unchanged.  Otherwise the tag name is
    lowercased and stored in self.lasttag, attributes are collected as
    (lowercased name, value) pairs, and either handle_startendtag() (tag
    text ends with '/>') or handle_starttag() is called.  After
    handle_starttag(), set_cdata_mode() is called when the tag is listed
    in self.CDATA_CONTENT_ELEMENTS.

    An attribute without '=value' gets the value None.  A value wrapped
    in matching single or double quotes has the quotes stripped and is
    passed through unescape(); other values are stored as matched.

    Returns the end index of the tag.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]
    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    match = tagfind.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = rawdata[i+1:k].lower()
    while k < endpos:
        m = attrfind.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            attrvalue = None
        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
             attrvalue[:1] == '"' == attrvalue[-1:]:
            attrvalue = attrvalue[1:-1]
            attrvalue = self.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()
    end = rawdata[k:endpos].strip()
    # Besides the normal ">" and "/>" endings, an empty remainder and a
    # stray "," are tolerated so that some malformed tags are skipped
    # without reporting an error. (Anand)
    if end not in ("", ",", ">", "/>"):
        # NOTE(review): lineno/offset are computed but not used below;
        # kept as-is because getpos() is defined outside this block.
        lineno, offset = self.getpos()
        if "\n" in self.__starttag_text:
            lineno = lineno + self.__starttag_text.count("\n")
            offset = len(self.__starttag_text) \
                     - self.__starttag_text.rfind("\n")
        else:
            offset = offset + len(self.__starttag_text)
        # repr() replaces the backtick form, which is a syntax error on
        # Python 3 and produces the same text on Python 2.
        self.error("junk characters in start tag: %s"
                   % repr(rawdata[k:endpos][:20]))
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        self.handle_starttag(tag, attrs)
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode()
    return endpos
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Check whether a complete start tag begins at index i.

    Returns the index just past the tag's closing '>' or '/>', or -1 when
    the data seen so far ends before the tag is complete.  Any other
    character following the locatestarttagend match is reported through
    updatepos() and error().  AssertionError is raised if the pattern
    does not match at i, or if error() returns instead of raising.
    """
    text = self.rawdata
    located = locatestarttagend.match(text, i)
    if not located:
        raise AssertionError("we should not get here!")
    j = located.end()
    following = text[j:j+1]
    if following == ">":
        return j + 1
    if following == "/":
        if text.startswith("/>", j):
            return j + 2
        # NOTE(review): following == "/" already implies this test is
        # true, so the "bogus input" lines below are never reached.
        if text.startswith("/", j):
            # buffer boundary
            return -1
        # else bogus input
        self.updatepos(i, j + 1)
        self.error("malformed empty start tag")
    if following == "":
        # end of input
        return -1
    attr_chars = ("abcdefghijklmnopqrstuvwxyz=/"
                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if following in attr_chars:
        # end of input in or before attribute value, or we have the
        # '/' from a '/>' ending
        return -1
    self.updatepos(i, j)
    self.error("malformed start tag")
    raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Parse the end tag beginning at index i of self.rawdata.

    Returns -1 when the endendtag pattern finds no terminator after i
    (tag incomplete).  Otherwise the tag name captured by endtagfind is
    lowercased and passed to handle_endtag(), clear_cdata_mode() is
    called, and the index just past the terminator is returned.  A tag
    that endtagfind does not match is reported through error().
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    match = endendtag.search(rawdata, i+1)  # >
    if not match:
        return -1
    j = match.end()
    match = endtagfind.match(rawdata, i)  # </ + tag + >
    if not match:
        # repr() replaces the backtick form, which is a syntax error on
        # Python 3 and produces the same text on Python 2.
        self.error("bad end tag: %s" % repr(rawdata[i:j]))
        # Only reached if error() returns instead of raising: skip the
        # malformed tag rather than dereferencing the failed match.
        return j
    tag = match.group(1)
    self.handle_endtag(tag.lower())
    self.clear_cdata_mode()
    return j
# Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
    """Handle an empty-element tag (<tag.../>).

    The default treats it as a start tag immediately followed by the
    matching end tag.  Subclasses may override.
    """
    self.handle_starttag(tag, attrs)
    self.handle_endtag(tag)
# Overridable -- handle start tag
def handle_starttag(self, tag, attrs):
    """Hook for a start tag; the default implementation does nothing.

    parse_starttag() passes the lowercased tag name and a list of
    (name, value) attribute pairs.
    """
# Overridable -- handle end tag
def handle_endtag(self, tag):
    """Hook for an end tag; the default implementation does nothing.

    parse_endtag() passes the lowercased tag name.
    """
# Overridable -- handle character reference
def handle_charref(self, name):
    """Hook for a character reference; the default does nothing."""
# Overridable -- handle entity reference
def handle_entityref(self, name):
    """Hook for an entity reference; the default does nothing."""
# Overridable -- handle data
def handle_data(self, data):
    """Hook for character data; the default does nothing."""
# Overridable -- handle comment
def handle_comment(self, data):
    """Hook for the text of a comment; the default does nothing."""
# Overridable - handles java scripts
def handle_javascript(self, tag, attrs):
    """Hook for a script element; the default implementation does nothing.

    parse_script_tag() passes 'script' as tag and a list holding at most
    one (name, value) pair for the script's source link.
    """
# Overridable - handles java applet tags
def handle_java_applet_tag(self, tag, attrs):
    """Hook for an applet element; the default implementation does nothing.

    parse_applet_tag() passes 'applet' as tag and a list of (name, value)
    pairs found by the appletcode and appletcodebase patterns.
    """
# Overridable -- handle declaration
def handle_decl(self, decl):
    """Hook for a declaration; the default does nothing."""
# Overridable -- handle processing instruction
def handle_pi(self, data):
    """Hook for a processing instruction; the default does nothing.

    parse_pi() passes the text that follows the opening '<?', up to the
    start of the closing match.
    """
def unknown_decl(self, data):
    """Report an unrecognised declaration through self.error().

    The message is "unknown declaration: " followed by repr(data).
    repr() replaces the backtick form, which is a syntax error on
    Python 3 and produces the same text on Python 2.
    """
    self.error("unknown declaration: " + repr(data))
# Internal -- helper to remove special character quoting
def unescape(self, s):
    """Return s with the five basic markup entities replaced.

    &lt;, &gt;, &apos;, &quot; and &amp; become <, >, ', " and &.  A
    string without '&' is returned unchanged.
    """
    if '&' not in s:
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # Must be last: otherwise "&amp;lt;" would first become "&lt;" and
    # then be unescaped a second time to "<".
    s = s.replace("&amp;", "&")
    return s
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -