⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sgmllib.py

📁 reduced python source for embedded apps
💻 PY
字号:
# A parser for SGML, using the derived class as static DTD.# XXX This only supports those SGML features used by HTML.# XXX There should be a way to distinguish between PCDATA (parsed# character data -- the normal case), RCDATA (replaceable character# data -- only char and entity references and end tags are special)# and CDATA (character data -- only end tags are special).import regeximport string# Regular expressions used for parsingincomplete = regex.compile( \	  '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' + \	  '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&')entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')charref = regex.compile('&#[a-zA-Z0-9]+;')starttagopen = regex.compile('<[a-zA-Z]')endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')commentopen = regex.compile('<!--')# SGML parser base class -- find tags and call handler functions.# Usage: p = SGMLParser(); p.feed(data); ...; p.close().# The dtd is defined by deriving a class which defines methods# with special names to handle tags: start_foo and end_foo to handle# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.# (Tags are converted to lower case for this purpose.)  The data# between tags is passed to the parser by calling self.handle_data()# with some data as argument (the data may be split up in arbutrary# chunks).  Entity references are passed by calling# self.handle_entityref() with the entity reference as argument.class SGMLParser:	# Interface -- initialize and reset this instance	def __init__(self):		self.reset()	# Interface -- reset this instance.  Loses all unprocessed data	def reset(self):		self.rawdata = ''		self.stack = []		self.nomoretags = 0		self.literal = 0	# For derived classes only -- enter literal mode (CDATA) till EOF	def setnomoretags(self):		self.nomoretags = self.literal = 1	# For derived classes only -- enter literal mode (CDATA)	def setliteral(self, *args):		self.literal = 1	# Interface -- feed some data to the parser.  Call this as	# often as you want, with as little or as much text as you	# want (may include '\n').  (This just saves the text, all the	# processing is done by process() or close().)	def feed(self, data):		self.rawdata = self.rawdata + data		self.goahead(0)	# Interface -- handle the remaining data	def close(self):		self.goahead(1)	# Internal -- handle data as far as reasonable.  May leave state	# and data to be processed by a subsequent call.  If 'end' is	# true, force handling all data as if followed by EOF marker.	def goahead(self, end):		rawdata = self.rawdata		i = 0		n = len(rawdata)		while i < n:			if self.nomoretags:				self.handle_data(rawdata[i:n])				i = n				break			j = incomplete.search(rawdata, i)			if j < 0: j = n			if i < j: self.handle_data(rawdata[i:j])			i = j			if i == n: break			if rawdata[i] == '<':				if starttagopen.match(rawdata, i) >= 0:					if self.literal:						self.handle_data(rawdata[i])						i = i+1						continue					k = self.parse_starttag(i)					if k < 0: break					i = i + k					continue				k = endtag.match(rawdata, i)				if k >= 0:					j = i+k					self.parse_endtag(rawdata[i:j])					i = j					self.literal = 0					continue				if commentopen.match(rawdata, i) >= 0:					if self.literal:						self.handle_data(rawdata[i])						i = i+1						continue					k = self.parse_comment(i)					if k < 0: break					i = i+k					continue			elif rawdata[i] == '&':				k = charref.match(rawdata, i)				if k >= 0:					j = i+k					self.handle_charref(rawdata[i+2:j-1])					i = j					continue				k = entityref.match(rawdata, i)				if k >= 0:					j = i+k					self.handle_entityref(rawdata[i+1:j-1])					i = j					continue			else:				raise RuntimeError, 'neither < nor & ??'			# We get here only if incomplete matches but			# nothing else			k = incomplete.match(rawdata, i)			if k < 0: raise RuntimeError, 'no incomplete match ??'			j = i+k			if j == n: break # Really incomplete			self.handle_data(rawdata[i:j])			i = j		# end while		if end and i < n:			self.handle_data(rawdata[i:n])			i = n		self.rawdata = rawdata[i:]		# XXX if end: check for empty stack	# Internal -- parse comment, return length or -1 if not ternimated	def parse_comment(self, i):		rawdata = self.rawdata		if rawdata[i:i+4] <> '<!--':			raise RuntimeError, 'unexpected call to handle_comment'		try:			j = string.index(rawdata, '--', i+4)		except string.index_error:			return -1		self.handle_comment(rawdata[i+4: j])		j = j+2		n = len(rawdata)		while j < n and rawdata[j] in ' \t\n': j = j+1		if j == n: return -1 # Wait for final '>'		if rawdata[j] == '>':			j = j+1		else:			print '*** comment not terminated with >'			print repr(rawdata[j-5:j]), '*!*', repr(rawdata[j:j+5])		return j-i	# Internal -- handle starttag, return length or -1 if not terminated	def parse_starttag(self, i):		rawdata = self.rawdata		try:			j = string.index(rawdata, '>', i)		except string.index_error:			return -1		# Now parse the data between i+1 and j into a tag and attrs		attrs = []		tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')		attrfind = regex.compile( \		  '[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' + \		  '\([ \t\n]*=[ \t\n]*' + \		     '\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?')		k = tagfind.match(rawdata, i+1)		if k < 0:			raise RuntimeError, 'unexpected call to parse_starttag'		k = i+1+k		tag = string.lower(rawdata[i+1:k])		while k < j:			l = attrfind.match(rawdata, k)			if l < 0: break			regs = attrfind.regs			a1, b1 = regs[1]			a2, b2 = regs[2]			a3, b3 = regs[3]			attrname = rawdata[a1:b1]			if '=' in rawdata[k:k+l]:				attrvalue = rawdata[a3:b3]				if attrvalue[:1] == '\'' == attrvalue[-1:] or \				   attrvalue[:1] == '"' == attrvalue[-1:]:					attrvalue = attrvalue[1:-1]			else:				attrvalue = ''			attrs.append(string.lower(attrname), attrvalue)			k = k + l		j = j+1		try:			method = getattr(self, 'start_' + tag)		except AttributeError:			try:				method = getattr(self, 'do_' + tag)			except AttributeError:				self.unknown_starttag(tag, attrs)				return j-i			method(attrs)			return j-i		self.stack.append(tag)		method(attrs)		return j-i	# Internal -- parse endtag	def parse_endtag(self, data):		if data[:2] <> '</' or data[-1:] <> '>':			raise RuntimeError, 'unexpected call to parse_endtag'		tag = string.lower(string.strip(data[2:-1]))		try:			method = getattr(self, 'end_' + tag)		except AttributeError:			self.unknown_endtag(tag)			return		if self.stack and self.stack[-1] == tag:			del self.stack[-1]		else:			print '*** Unbalanced </' + tag + '>'			print '*** Stack:', self.stack			found = None			for i in range(len(self.stack)):				if self.stack[i] == tag: found = i			if found <> None:				del self.stack[found:]		method()	# Example -- handle character reference, no need to override	def handle_charref(self, name):		try:			n = string.atoi(name)		except string.atoi_error:			self.unknown_charref(name)			return		if not 0 <= n <= 255:			self.unknown_charref(name)			return		self.handle_data(chr(n))	# Definition of entities -- derived classes may override	entitydefs = \		{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}	# Example -- handle entity reference, no need to override	def handle_entityref(self, name):		table = self.__class__.entitydefs		name = string.lower(name)		if table.has_key(name):			self.handle_data(table[name])		else:			self.unknown_entityref(name)			return	# Example -- handle data, should be overridden	def handle_data(self, data):		pass	# Example -- handle comment, could be overridden	def handle_comment(self, data):		pass	# To be overridden -- handlers for unknown objects	def unknown_starttag(self, tag, attrs): pass	def unknown_endtag(self, tag): pass	def unknown_charref(self, ref): pass	def unknown_entityref(self, ref): passclass TestSGML(SGMLParser):	def handle_data(self, data):		r = repr(data)		if len(r) > 72:			r = r[:35] + '...' + r[-35:]		print 'data:', r	def handle_comment(self, data):		r = repr(data)		if len(r) > 68:			r = r[:32] + '...' + r[-32:]		print 'comment:', r	def unknown_starttag(self, tag, attrs):		print 'start tag: <' + tag,		for name, value in attrs:			print name + '=' + '"' + value + '"',		print '>'	def unknown_endtag(self, tag):		print 'end tag: </' + tag + '>'	def unknown_entityref(self, ref):		print '*** unknown entity ref: &' + ref + ';'	def unknown_charref(self, ref):		print '*** unknown char ref: &#' + ref + ';'def test():	file = 'test.html'	f = open(file, 'r')	x = TestSGML()	while 1:		line = f.readline()		if not line:			x.close()			break		x.feed(line)#test()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -