📄 optimized_al.py
字号:
"""This script duplicates adjacency_list.py, but optimizes the loading
of XML nodes to be based on a "flattened" datamodel. Any number of XML documents,
each of arbitrary complexity, can be loaded in their entirety via a single query
which joins on only three tables.

"""

################################# PART I - Imports/Configuration ###########################################
# NOTE: the star-imports come first on purpose, so that the explicit stdlib
# imports below win if any name were to collide.
from sqlalchemy import *
from sqlalchemy.orm import *

import sys
import os
import StringIO
import re

import logging
logging.basicConfig()

# uncomment to show SQL statements
#logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

# uncomment to show SQL statements and result sets
#logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)


from elementtree import ElementTree
from elementtree.ElementTree import Element, SubElement

# all tables below are registered on this MetaData, which is bound to a SQLite URL.
meta = MetaData()
meta.bind = 'sqlite://'

################################# PART II - Table Metadata ###########################################

# stores a top level record of an XML document.
documents = Table('documents', meta,
    Column('document_id', Integer, primary_key=True),
    Column('filename', String(30), unique=True),
)

# stores XML nodes in an adjacency list model.  This corresponds to
# Element and SubElement objects.
elements = Table('elements', meta,
    Column('element_id', Integer, primary_key=True),
    # self-referential link to the parent element; NULL for a document's root node.
    Column('parent_id', Integer, ForeignKey('elements.element_id')),
    # every node also points directly at its document -- this is the "flattened"
    # part that lets a whole document be fetched without walking the hierarchy.
    Column('document_id', Integer, ForeignKey('documents.document_id')),
    Column('tag', Unicode(30), nullable=False),
    Column('text', Unicode),
    Column('tail', Unicode)
    )

# stores attributes.  This corresponds to the dictionary of attributes
# stored by an Element or SubElement.
attributes = Table('attributes', meta,
    # composite primary key: (element_id, name)
    Column('element_id', Integer, ForeignKey('elements.element_id'), primary_key=True),
    Column('name', Unicode(100), nullable=False, primary_key=True),
    Column('value', Unicode(255)))

meta.create_all()

#################################### PART III - Model #############################################

# our document class.
# contains a string name,
# and the ElementTree root element.
class Document(object):
    """A named XML document.

    Holds the ``filename`` it was loaded under and, via the ``element``
    attribute, the ElementTree object passed to the constructor.
    """

    def __init__(self, name, element):
        """Store *name* as ``filename`` and *element* as ``element``."""
        self.filename = name
        self.element = element

    def __str__(self):
        """Return the text produced by ``self.element.write()``."""
        buf = StringIO.StringIO()
        self.element.write(buf)
        return buf.getvalue()

#################################### PART IV - Persistence Mapping ###################################

# Node class.  a non-public class which will represent
# the DB-persisted Element/SubElement object.  We cannot create mappers for
# ElementTree elements directly because they are at the very least not new-style
# classes, and also may be backed by native implementations.
# so here we construct an adapter.
class _Node(object):
    """Adapter object mapped to one row of the ``elements`` table."""
    pass

# Attribute class.  also internal, this will represent the key/value attributes stored for
# a particular Node.
class _Attribute(object):
    """One name/value attribute pair belonging to a ``_Node``."""

    def __init__(self, name, value):
        self.name = name
        self.value = value

# setup mappers.  Document will eagerly load a list of _Node objects.
# they will be ordered in primary key/insert order, so that we can reconstruct
# an ElementTree structure from the list.
mapper(Document, documents, properties={
    '_nodes':relation(_Node, lazy=False, cascade="all, delete-orphan")
})

# the _Node objects change the way they load so that a list of _Nodes will organize
# themselves hierarchically using the ElementTreeMarshal.
this depends on the ordering of# nodes being hierarchical as well; relation() always applies at least ROWID/primary key# ordering to rows which will suffice.mapper(_Node, elements, properties={ 'children':relation(_Node, lazy=None), # doesnt load; used only for the save relationship 'attributes':relation(_Attribute, lazy=False, cascade="all, delete-orphan"), # eagerly load attributes})mapper(_Attribute, attributes)# define marshalling functions that convert from _Node/_Attribute to/from ElementTree objects.# this will set the ElementTree element as "document._element", and append the root _Node# object to the "_nodes" mapped collection.class ElementTreeMarshal(object): def __get__(self, document, owner): if document is None: return self if hasattr(document, '_element'): return document._element nodes = {} root = None for node in document._nodes: if node.parent_id is not None: parent = nodes[node.parent_id] elem = ElementTree.SubElement(parent, node.tag) nodes[node.element_id] = elem else: parent = None elem = root = ElementTree.Element(node.tag) nodes[node.element_id] = root for attr in node.attributes: elem.attrib[attr.name] = attr.value elem.text = node.text elem.tail = node.tail document._element = ElementTree.ElementTree(root) return document._element def __set__(self, document, element): def traverse(node): n = _Node() n.tag = unicode(node.tag) n.text = unicode(node.text) n.tail = unicode(node.tail) document._nodes.append(n) n.children = [traverse(n2) for n2 in node] n.attributes = [_Attribute(unicode(k), unicode(v)) for k, v in node.attrib.iteritems()] return n traverse(element.getroot()) document._element = element def __delete__(self, document): del document._element document._nodes = []# override Document's "element" attribute with the marshaller.Document.element = ElementTreeMarshal()########################################### PART V - Basic Persistence Example ############################line = 
"\n--------------------------------------------------------"# save to DBsession = create_session()# get ElementTree documentsfor file in ('test.xml', 'test2.xml', 'test3.xml'): filename = os.path.join(os.path.dirname(sys.argv[0]), file) doc = ElementTree.parse(filename) session.save(Document(file, doc))print "\nSaving three documents...", linesession.flush()print "Done."# clear session (to illustrate a full load), restoresession.clear()print "\nFull text of document 'text.xml':", linedocument = session.query(Document).filter_by(filename="test.xml").first()print document############################################ PART VI - Searching for Paths ######################################## manually search for a document which contains "/somefile/header/field1:hi"print "\nManual search for /somefile/header/field1=='hi':", lined = session.query(Document).join('_nodes', aliased=True).filter(and_(_Node.parent_id==None, _Node.tag==u'somefile')).\ join('children', aliased=True, from_joinpoint=True).filter(_Node.tag==u'header').\ join('children', aliased=True, from_joinpoint=True).filter(and_(_Node.tag==u'field1', _Node.text==u'hi')).\ one()print d# generalize the above approach into an extremely impoverished xpath function:def find_document(path, compareto): j = documents prev_elements = None query = session.query(Document) first = True for i, match in enumerate(re.finditer(r'/([\w_]+)(?:\[@([\w_]+)(?:=(.*))?\])?', path)): (token, attrname, attrvalue) = match.group(1, 2, 3) if first: query = query.join('_nodes', aliased=True).filter(_Node.parent_id==None) first = False else: query = query.join('children', aliased=True, from_joinpoint=True) query = query.filter(_Node.tag==token) if attrname: query = query.join('attributes', aliased=True, from_joinpoint=True) if attrvalue: query = query.filter(and_(_Attribute.name==attrname, _Attribute.value==attrvalue)) else: query = query.filter(_Attribute.name==attrname) return 
query.options(lazyload('_nodes')).filter(_Node.text==compareto).all()for path, compareto in ( (u'/somefile/header/field1', u'hi'), (u'/somefile/field1', u'hi'), (u'/somefile/header/field2', u'there'), (u'/somefile/header/field2[@attr=foo]', u'there') ): print "\nDocuments containing '%s=%s':" % (path, compareto), line print [d.filename for d in find_document(path, compareto)]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -