#! /usr/bin/env python

"""A simple library for modelling XML in python. It's a bit like a super-lightweight DOM, at the moment immutable (or rather, designed to be - the objects are actually entirely mutable, but there's no validation on changes), with the intention of being structurally unable to express ill-formed XML, and functionally unable to express invalid XML, with support for sophisticated rules for the validation. Validation rules are expressed by configuring Tag objects, which are used as factories for Element objects.

At the moment, validation of element content is highly limited: it does not come anywhere close to validating against the kind of rules expressible in a DTD. However, it actually does pretty well for the kind of rules you get in practice, which tend to look like (A|B|C)+.

There is no support for generating Tag instances from DTDs (or schemas or whatever), but that is an eventual goal.

This program comes without any warranty, to the extent permitted by applicable law. Redistribution and use in source and binary forms, with or without modification, are permitted.

Version 1, 2009-05-23

"""

import functools
import re
from xml.sax.saxutils import escape, quoteattr

PCDATA = "#"
REGEXP_TYPE = type(re.compile("."))

class Tag(object):
	"""A factory for Element objects of one element type, carrying that type's validation rules.

	name is the element name. contentModel is a collection of the child element names (plus PCDATA, if text is allowed) that may appear inside the element. attributeModel maps each legal attribute name to a validator: None for no specific rule, a compiled regular expression that must match the whole value, or a callable that is handed the value and raises if it is unacceptable. block is the value elements of this type report from isBlock().

	Every validation failure is reported by raising AssertionError.
	"""

	# Python 2 spells the common base class of text strings basestring; Python 3 only has str
	try:
		_TEXT_TYPES = (basestring,)
	except NameError:
		_TEXT_TYPES = (str,)

	def __init__(self, name, contentModel=(), attributeModel=None, block=True):
		self.name = name
		self.contentModel = contentModel
		# a fresh dict per tag, so that tags built without a model never share one mutable default
		self.attributeModel = {} if attributeModel is None else attributeModel
		self.block = block
	def __call__(self, *content, **attrs):
		"Build an Element of this type from positional children and keyword attributes."
		return Element(self, content, attrs)
	def validate(self, content, attrs):
		"Check content, then attributes, raising AssertionError on the first violation found."
		self.validateContent(content)
		self.validateAttributes(attrs)
	def validateContent(self, content):
		"This does a very simple validation that does not capture the full rigour of a DTD: each child is checked against the content model on its own, with no regard to order or number. Raises AssertionError for a child that is not allowed."
		# explicit raises rather than assert statements, so that validation survives python -O
		for child in content:
			if isinstance(child, self._TEXT_TYPES):
				if PCDATA not in self.contentModel:
					raise AssertionError("elements of type " + self.name + " cannot contain text")
			elif isinstance(child, Element):
				if child.tag.name not in self.contentModel:
					raise AssertionError("elements of type " + self.name + " cannot contain elements of type " + child.tag.name)
			elif isinstance(child, (Comment, ProcessingInstruction)):
				pass # always okay
			else:
				raise AssertionError(str(child) + " is not a node or string")
	def validateAttributes(self, attrs):
		"Check every attribute in attrs against the attribute model. Raises AssertionError for an unknown attribute, a value its regular expression does not fully match, or a model entry that is not a usable validator; whatever a callable validator raises is passed through untouched."
		for name, value in attrs.items():
			# only the lookup is guarded, so that a KeyError raised inside a validator is not mistaken for an unknown attribute
			try:
				validator = self.attributeModel[name]
			except KeyError:
				raise AssertionError(name + " is an illegal attribute for elements of type " + self.name)
			if validator is None:
				continue # no specific validity rule
			if callable(validator):
				validator(value)
			elif isinstance(validator, REGEXP_TYPE):
				m = validator.match(value)
				# match() only anchors at the start, so also insist that the match runs to the end of the value
				if m is None or m.end() != len(value):
					raise AssertionError(value + " is an invalid value for attribute " + name)
			else:
				raise AssertionError("invalid validator: " + str(validator))

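# a minimal marker for abstract methods: the decorated method raises NotImplementedError when called, so subclasses must override it (the standard library's abc module is a stricter alternative)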
def abstract(meth):
	"Decorator marking meth as abstract: the returned method ignores its arguments and raises NotImplementedError naming meth, while keeping meth's name and docstring."
	msg = meth.__name__ + " is abstract"
	# wraps() copies the name and docstring across, so the method still introspects as itself rather than as abstract_meth
	@functools.wraps(meth)
	def abstract_meth(self, *args, **kwargs):
		raise NotImplementedError(msg)
	return abstract_meth

class Node(object):
	"Base class for the document nodes defined in this module."
	@abstract
	def isBlock(self):
		"Report whether the node is block-level; abstract here, so subclasses supply the answer."
		pass

class Element(Node):
	"""An XML element: a tag, a sequence of children (strings and nodes) and a mapping of attribute names to values.

	The constructor asks the tag to validate the content and attributes, so an element that exists has passed its tag's rules at construction time.
	"""

	# Python 2 spells the common base class of text strings basestring; Python 3 only has str
	try:
		_TEXT_TYPES = (basestring,)
	except NameError:
		_TEXT_TYPES = (str,)

	def __init__(self, tag, content, attrs):
		# validate before storing anything; a rejected element raises out of the constructor
		tag.validate(content, attrs)
		self.tag = tag
		self.content = content
		self.attrs = attrs
	def isBlock(self):
		"An element is block-level exactly when its tag says so."
		return self.tag.block
	def __str__(self):
		"Serialise the element as markup: attribute values are quoted, text children are escaped, block-level children are set on lines of their own, and an element with no content is written as an empty-element tag."
		parts = ["<", self.tag.name]
		for name, value in self.attrs.items():
			parts.append(" " + name + "=" + quoteattr(value))
		if len(self.content) == 0:
			parts.append("/>")
			return "".join(parts)
		parts.append(">")
		# NOTE(review): afterBlock is set once a block child has been written and never cleared, so a block that follows text or an inline node later on gets no line break before it - confirm whether that is intended
		afterBlock = False
		for child in self.content:
			if isinstance(child, self._TEXT_TYPES):
				parts.append(escape(child))
				continue
			if not isinstance(child, Node):
				# str() the offender: it is by definition not a string, so bare concatenation would raise TypeError instead
				raise AssertionError("child was neither a string nor a node: " + str(child))
			block = child.isBlock()
			if block and not afterBlock:
				parts.append("\n")
			parts.append(str(child))
			if block:
				parts.append("\n")
				afterBlock = True
		parts.append("</" + self.tag.name + ">")
		return "".join(parts)

class ProcessingInstruction(Node):
	"""A processing instruction, written as <?target data?>.

	The data is given either literally, as the data argument, or as keyword pseudo-attributes, which are rendered as name="value" pairs separated by spaces; giving both raises AssertionError, as does literal data containing the ?> terminator. With neither, the instruction consists of the target alone.
	"""
	def __init__(self, target, data=None, **attrs):
		if data is not None and len(attrs) > 0:
			raise AssertionError("processing instruction cannot have both literal data and pseudo-attributes")
		self.target = target
		if data is not None:
			# ?> inside the data would end the instruction early and leave ill-formed markup behind
			if "?>" in data:
				raise AssertionError("processing instruction data cannot contain ?>: " + data)
			self.data = data
		elif len(attrs) > 0:
			# quoteattr escapes > in the values, so generated data can never contain ?>
			self.data = " ".join([name + "=" + quoteattr(value) for name, value in attrs.items()])
		else:
			self.data = None
	def isBlock(self):
		"Processing instructions are always block-level."
		return True
	def __str__(self):
		"Serialise as <?target data?>, or <?target?> when there is no data."
		if self.data is not None:
			return "<?" + self.target + " " + self.data + "?>"
		return "<?" + self.target + "?>"

class Comment(Node):
	"""A comment, written as <!-- text -->.

	Raises AssertionError if the text contains a double hyphen, which XML does not allow inside a comment.
	"""
	def __init__(self, text):
		# the padding spaces added by __str__ keep a single trailing hyphen clear of the closing -->, so only -- needs rejecting
		if "--" in text:
			raise AssertionError("comment text cannot contain --: " + text)
		self.text = text
	def isBlock(self):
		"Comments are never block-level."
		return False
	def __str__(self):
		"Serialise as <!-- text -->, with one space of padding on either side of the text."
		return "<!-- " + self.text + " -->"

# lowercase aliases, so that processing instructions and comments can be written in the same style as the lowercase tag factories when building a document
pi = ProcessingInstruction
comment = Comment

# below is a demo of tomdom using a simplified subset of XHTML

# attribute formats
TEXT = None # free text: an attribute with no specific validity rule
try:
	import urlparse # Python 2
except ImportError:
	import urllib.parse as urlparse # Python 3 moved the same functions here
def URI(uri):
	"A deliberately crude check, here to demonstrate attribute validation with a function: the only URIs it rejects are those with a space in the path. Raises AssertionError for such a URI and returns nothing otherwise."
	# explicit raises rather than assert statements, so that the check survives python -O
	uriParts = urlparse.urlsplit(uri)
	if uriParts is None: # in practice, urlsplit always returns a result
		raise AssertionError("URI is invalid: " + uri)
	if " " in uriParts[2]: # item 2 of the split result is the path
		raise AssertionError("URI is invalid due to spaces in the path: " + uri)
NMTOKEN = re.compile("[:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff\\-.0-9\xb7]+") # 8-bit subset!

# content models
# document skeleton
html = Tag("html", contentModel=("head", "body"))
head = Tag("head", contentModel=("title",))
title = Tag("title", contentModel=(PCDATA,))
body = Tag("body", contentModel=("h1", "p", "ul", "table"))

# the mixed content shared by most text-bearing elements: text plus the two inline elements
_inline = (PCDATA, "a", "em")

# text-bearing block elements
h1 = Tag("h1", contentModel=_inline)
p = Tag("p", contentModel=_inline, attributeModel={"title": TEXT})
ul = Tag("ul", contentModel=("li",))
li = Tag("li", contentModel=_inline)

# inline elements, hence block=False
a = Tag("a", contentModel=(PCDATA, "em"), attributeModel={"href": URI, "name": NMTOKEN, "title": TEXT}, block=False)
em = Tag("em", contentModel=_inline, attributeModel={}, block=False)

# tables
table = Tag("table", contentModel=("tr",))
tr = Tag("tr", contentModel=("th", "td"))
th = Tag("th", contentModel=_inline)
td = Tag("td", contentModel=_inline)

# now build some stuff
expenditures = {
	"mortgage": 50000.00,
	"furniture": 7000.00,
	"duck island": 1645.00,
}

# the table body is prepared up front: one generated row per expenditure, then a literal totals row
# items(), values(), range() and print(...) are used because they behave the same on Python 2 and Python 3
expenditureRows = tuple(tr(td(item), td("%.2f" % (amount,))) for item, amount in expenditures.items())
totalRow = tr(th("TOTAL"), th(str(sum(expenditures.values()))))

mydoc = html(
	head(
		title("My Document"),
		comment("yes, i know this should go in the prologue"),
		pi("xml-stylesheet", href="mystylesheet.css", type="text/css"),
	),
	body(
		h1("This Is ", em("My"), " Document"),
		p("This is some text containing <escapable> content & also ", "in multiple ", "nodes."),
		ul(
			li(a("Alice", href="http://alice.org/", title="Alice's site")),
			li(a("Bob", href="http://bob.net/", title="Bob <3 12\" Vinyl"))
		),
		ul(
			*map(li, map(str, range(5)))
		),
		p(title="empty"),
		table(
			tr(th("Item"), th("Amount")),
			# generated rows followed by a literal row still have to be concatenated into one sequence before being exploded into arguments
			*(expenditureRows + (totalRow,))
		)
	)
)

print(mydoc)

def fail():
	"Raise AssertionError to report that a construction which should have been rejected was accepted."
	# raised explicitly: an assert statement would be stripped by python -O, and the failure would then go unreported
	raise AssertionError("TEST SHOULD HAVE FAILED!!!")

# each entry builds a document that validation must reject; construction is deferred behind a lambda so that every attempt runs inside the try below
badDocBuilders = (
	# text where the content model has no PCDATA
	lambda: html(
		body(
			"bad text",
		)
	),
	# an element type missing from its parent's content model
	lambda: html(
		p(p("text"))
	),
	# a child that is neither a string nor a node
	lambda: html(
		p(666)
	),
	# an attribute value rejected by a validator function
	lambda: html(
		body(
			p(a("text", href="ceci n'est pas un URI"))
		)
	),
	# an attribute value rejected by a regular expression
	lambda: html(
		body(
			p(a("text", name="ceci n'est pas un nom"))
		)
	),
	# an attribute missing from the attribute model
	lambda: html(
		body(
			p(nosuchattr="bad attr")
		)
	),
)

for buildBadDoc in badDocBuilders:
	try:
		baddoc = buildBadDoc()
		fail()
	# "except ... as" and print(...) are accepted by Python 2.6+ and Python 3 alike
	except AssertionError as e:
		print(e)