#! /usr/bin/env python

"""A simple library for modelling XML in python.

It's a bit like a super-lightweight DOM, at the moment immutable (or rather,
designed to be - the objects are actually entirely mutable, but there's no
validation on changes), with the intention of being structurally unable to
express ill-formed XML, and functionally unable to express invalid XML, with
support for sophisticated rules for the validation.

Validation rules are expressed by configuring Tag objects, which are used as
factories for Element objects.

At the moment, validation of element content is highly limited: it does not
come anywhere close to validating against the kind of rules expressible in a
DTD. However, it actually does pretty well for the kind of rules you get in
practice, which tend to look like (A|B|C)+.

There is no support for generating Tag instances from DTDs (or schemas or
whatever), but that is an eventual goal.

This program comes without any warranty, to the extent permitted by
applicable law. Redistribution and use in source and binary forms, with or
without modification, are permitted.

Version 1, 2009-05-23

"""

import functools
import re
from xml.sax.saxutils import escape, quoteattr

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3 home of urlsplit

try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3

# Marker used in a content model to say "text is allowed here".
PCDATA = "#"
REGEXP_TYPE = type(re.compile("."))


class ValidationError(AssertionError):
    """Raised when content or attributes break a Tag's validation rules.

    It subclasses AssertionError so that callers written against the
    assert-based checks (``except AssertionError``) keep working, but it is
    raised explicitly, so validation is not stripped by ``python -O``.
    """


class Tag(object):
    """A factory for Element objects that also carries the validation rules.

    name           -- the element type name
    contentModel   -- a container of permitted child element names, plus
                      PCDATA if text is permitted
    attributeModel -- a mapping from permitted attribute name to a validator:
                      None (any value), a compiled regular expression (which
                      must match the whole value), or a callable (which
                      should raise to reject the value)
    block          -- whether elements of this type are laid out on their
                      own lines when serialized
    """

    def __init__(self, name, contentModel=(), attributeModel=None, block=True):
        self.name = name
        self.contentModel = contentModel
        # a fresh dict per Tag, rather than one default shared by all of them
        self.attributeModel = {} if attributeModel is None else attributeModel
        self.block = block

    def __call__(self, *content, **attrs):
        """Build a validated Element of this type."""
        return Element(self, content, attrs)

    def validate(self, content, attrs):
        """Raise ValidationError unless content and attrs are both valid."""
        self.validateContent(content)
        self.validateAttributes(attrs)

    def validateContent(self, content):
        """Check every child against the content model.

        This does a very simple validation that does not capture the full
        rigour of DTD: it only checks that each child is of a permitted
        kind, not order or cardinality. Comments and processing instructions
        are always permitted.

        Raises ValidationError on the first offending child.
        """
        for child in content:
            if isinstance(child, _STRING_TYPES):
                if PCDATA not in self.contentModel:
                    raise ValidationError(
                        "elements of type " + self.name + " cannot contain text")
            elif isinstance(child, Element):
                if child.tag.name not in self.contentModel:
                    raise ValidationError(
                        "elements of type " + self.name
                        + " cannot contain elements of type " + child.tag.name)
            elif isinstance(child, (Comment, ProcessingInstruction)):
                pass  # always okay
            else:
                raise ValidationError(str(child) + " is not a node or string")

    def validateAttributes(self, attrs):
        """Check every attribute against the attribute model.

        Raises ValidationError for an attribute name that is not in the
        model, a value rejected by a regular expression validator, or a
        validator of an unsupported kind. Whatever a callable validator
        raises propagates unchanged.
        """
        for name, value in attrs.items():
            # The lookup is kept apart from the validator call so that a
            # KeyError raised inside a validator is not mistaken for an
            # unknown attribute.
            if name not in self.attributeModel:
                raise ValidationError(
                    name + " is an illegal attribute for elements of type "
                    + self.name)
            validator = self.attributeModel[name]
            if validator is None:
                continue  # no specific validity rule
            if isinstance(validator, REGEXP_TYPE):
                m = validator.match(value)
                # match() only anchors at the start; require the whole value
                if m is None or m.end() != len(value):
                    raise ValidationError(
                        value + " is an invalid value for attribute " + name)
            elif callable(validator):
                validator(value)
            else:
                raise ValidationError("invalid validator: " + str(validator))


# python doesn't have abstract methods, so do it ourselves with a little decorator
def abstract(meth):
    """Replace meth with a stub that raises NotImplementedError when called."""
    msg = meth.__name__ + " is abstract"

    @functools.wraps(meth)
    def abstract_meth(self, *args, **kwargs):
        raise NotImplementedError(msg)

    return abstract_meth


class Node(object):
    """Base class of everything other than plain text that can be a child."""

    @abstract
    def isBlock(self):
        pass


class Element(Node):
    """An XML element: a tag, a sequence of children and an attribute dict.

    The content and attributes are validated against the tag on construction
    (and only then), so construction raises ValidationError for invalid
    input.
    """

    def __init__(self, tag, content, attrs):
        tag.validate(content, attrs)
        self.tag = tag
        self.content = content
        self.attrs = attrs

    def isBlock(self):
        return self.tag.block

    def __str__(self):
        """Serialize the element, escaping text and quoting attribute values.

        Block-level children are each put on a line of their own; an element
        with no content is written as an empty-element tag.
        """
        parts = ["<", self.tag.name]
        for name, value in self.attrs.items():
            parts.append(" " + name + "=" + quoteattr(value))
        if not self.content:
            parts.append("/>")
            return "".join(parts)
        parts.append(">")
        # True while the output ends with the newline written after a block
        # child, so that adjacent blocks are separated by exactly one newline.
        afterBlock = False
        for child in self.content:
            if isinstance(child, _STRING_TYPES):
                parts.append(escape(child))
                afterBlock = False
                continue
            if not isinstance(child, Node):
                # only reachable if content was changed after construction
                raise ValidationError(
                    "child was neither a string nor a node: " + str(child))
            childIsBlock = child.isBlock()
            if childIsBlock and not afterBlock:
                parts.append("\n")
            parts.append(str(child))
            if childIsBlock:
                parts.append("\n")
            afterBlock = childIsBlock
        parts.append("</" + self.tag.name + ">")
        return "".join(parts)


class ProcessingInstruction(Node):
    """A processing instruction with either literal data or pseudo-attributes.

    Pseudo-attributes given as keyword arguments are rendered into the data
    as name="value" pairs separated by spaces. Supplying both literal data
    and pseudo-attributes raises ValidationError.
    """

    def __init__(self, target, data=None, **attrs):
        if data is not None and attrs:
            raise ValidationError(
                "processing instruction cannot have both literal data and pseudo-attributes")
        self.target = target
        if data is not None:
            self.data = data
        elif attrs:
            self.data = " ".join(
                name + "=" + quoteattr(value) for name, value in attrs.items())
        else:
            self.data = None

    def isBlock(self):
        return True

    def __str__(self):
        # NOTE: neither target nor data is checked for a premature "?>"
        if self.data is not None:
            return "<?" + self.target + " " + self.data + "?>"
        return "<?" + self.target + "?>"


class Comment(Node):
    """An XML comment; it is laid out inline rather than as a block."""

    def __init__(self, text):
        self.text = text

    def isBlock(self):
        return False

    def __str__(self):
        # NOTE: text is not checked for "--", which XML forbids in comments
        return "<!--" + self.text + "-->"


pi = ProcessingInstruction
comment = Comment

# below is a demo of tomdom using a simplified subset of XHTML

# attribute formats

TEXT = None


def URI(uri):
    """Reject URIs with a space in the path, by raising ValidationError.

    This is crap, but it serves to demonstrate attribute validation with a
    function.
    """
    uriParts = urlparse.urlsplit(uri)
    if " " in uriParts[2]:
        raise ValidationError(
            "URI is invalid due to spaces in the path: " + uri)


NMTOKEN = re.compile("[:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff\\-.0-9\xb7]+")  # 8-bit subset!
# content models

html = Tag("html", ("head", "body"))
head = Tag("head", ("title",))
title = Tag("title", (PCDATA,))
body = Tag("body", ("h1", "p", "ul", "table"))
_inline = (PCDATA, "a", "em")
h1 = Tag("h1", _inline)
p = Tag("p", _inline, {"title": TEXT})
ul = Tag("ul", ("li",))
li = Tag("li", _inline)
a = Tag("a", (PCDATA, "em"), {"href": URI, "name": NMTOKEN, "title": TEXT}, False)
em = Tag("em", _inline, {}, False)
table = Tag("table", ("tr",))
tr = Tag("tr", ("th", "td"))
th = Tag("th", _inline)
td = Tag("td", _inline)

# now build some stuff

expenditures = {
    "mortgage": 50000.00,
    "furniture": 7000.00,
    "duck island": 1645.00,
}

# A generated set of rows followed by a literal row. Collecting them in a
# list first means the table call only needs a single *-argument, which is
# valid syntax on every Python version.
_expenditureRows = [
    tr(td(item), td("%.2f" % (amount,)))
    for item, amount in expenditures.items()
]
_expenditureRows.append(tr(th("TOTAL"), th(str(sum(expenditures.values())))))

mydoc = html(
    head(
        title("My Document"),
        comment("yes, i know this should go in the prologue"),
        pi("xml-stylesheet", href="mystylesheet.css", type="text/css"),
    ),
    body(
        h1("This Is ", em("My"), " Document"),
        p("This is some text containing content & also ", "in multiple ", "nodes."),
        ul(
            li(a("Alice", href="http://alice.org/", title="Alice's site")),
            li(a("Bob", href="http://bob.net/", title="Bob <3 12\" Vinyl"))
        ),
        ul(
            *map(li, map(str, range(5)))
        ),
        p(title="empty"),
        table(
            tr(th("Item"), th("Amount")),
            *_expenditureRows
        )
    )
)

print(mydoc)


def fail():
    """Signal that a document which should have been rejected was accepted.

    Raises explicitly rather than with an assert statement, so that it still
    fires under ``python -O``.
    """
    raise AssertionError("TEST SHOULD HAVE FAILED!!!")


def _expectFailure(build):
    """Call build(), which should be rejected, and print the resulting error.

    If build() unexpectedly succeeds, the message from fail() is printed
    instead, since fail() raises the same exception type that is caught here.
    """
    try:
        build()
        fail()
    except AssertionError as e:
        print(e)


# each of these documents breaks one validation rule
for _buildBadDoc in (
    # text directly inside body
    lambda: html(body("bad text",)),
    # p nested inside p
    lambda: html(p(p("text"))),
    # a child that is neither a node nor a string
    lambda: html(p(666)),
    # href rejected by the URI validator function
    lambda: html(body(p(a("text", href="ceci n'est pas un URI")))),
    # name rejected by the NMTOKEN regular expression
    lambda: html(body(p(a("text", name="ceci n'est pas un nom")))),
    # attribute that is not in the attribute model
    lambda: html(body(p(nosuchattr="bad attr"))),
):
    _expectFailure(_buildBadDoc)