0001import string
0002import re
0003from lxml import etree as et
0004from cStringIO import StringIO
0005
# Clark-notation prefix ("{uri}") that _write strips from element tags.
XHTML_ns = '{http://www.w3.org/1999/xhtml}'

# Elements that _write emits as a lone start tag (no end tag) when they
# have neither text nor children.
empty_elements = set((
    'area', 'base', 'basefont', 'br',
    'col', 'frame', 'hr', 'img',
    'input', 'isindex', 'link', 'meta',
    'param',
))

# NOTE(review): not referenced anywhere in the visible part of this
# module -- presumably meant to exempt these elements' text from
# escaping; confirm.
noescape_elements = set((
    'script', 'style',
))

# Attributes that _write emits in minimized form: the bare name, without
# ="value".
boolean_attributes = set((
    'selected', 'checked', 'compact', 'declare',
    'defer', 'disabled', 'ismap', 'multiple',
    'nohref', 'noresize', 'noshade', 'nowrap',
))
0018
def _write(file, node, encoding, namespaces, drop_attribs):
    """Write ``node``, its subtree and its tail text to ``file`` as HTML.

    Comments and processing instructions are written in their own syntax.
    For elements, a leading XHTML namespace is stripped from the tag; any
    other ``{uri}name`` tag or attribute name is rewritten by ``fixtag``
    into ``prefix:name`` together with an ``xmlns:prefix`` declaration.
    Attributes are written in sorted order, attributes listed in
    ``boolean_attributes`` are written as a bare name, and elements
    listed in ``empty_elements`` that have neither text nor children get
    no end tag.

    Parameters:
        file: object whose ``write()`` method receives the markup.
        node: lxml node to serialize.
        encoding: codec name used to encode tag names, attribute names
            and text.
        namespaces: dict mapping namespace URI to prefix for the
            enclosing scope.  Entries added while writing this element
            are removed again before returning.
        drop_attribs: container of lower-case attribute names to leave
            out, or None to keep every attribute.

    NOTE(review): ``_raise_serialization_error`` is not defined in the
    part of the module reviewed here -- confirm it exists.
    NOTE(review): ``noescape_elements`` is not consulted here, so the
    text of script/style elements is escaped like any other text --
    confirm that this is intended.
    """
    tag = node.tag
    if isinstance(node, et._Comment):
        file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif isinstance(node, et._ProcessingInstruction):
        # lxml keeps the PI target separate from its text, so both are
        # needed to reproduce the instruction.
        content = node.target
        if node.text:
            content = "%s %s" % (content, node.text)
        file.write("<?%s?>" % _escape_cdata(content, encoding))
    else:
        if tag.startswith(XHTML_ns):
            tag = tag[len(XHTML_ns):]
        xmlns_items = []  # namespaces first declared on this element
        try:
            if isinstance(tag, et.QName) or tag[:1] == "{":
                tag, xmlns = fixtag(tag, namespaces)
                if xmlns:
                    xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)
        encoded_tag = tag.encode(encoding)
        file.write("<" + encoded_tag)
        for k, v in sorted(node.items()):  # lexical order
            try:
                if isinstance(k, et.QName) or k[:1] == "{":
                    k, xmlns = fixtag(k, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(k)
            try:
                if isinstance(v, et.QName):
                    v, xmlns = fixtag(v, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(v)
            name = k.lower()
            if drop_attribs is not None and name in drop_attribs:
                continue
            if name in boolean_attributes:
                file.write(' %s' % k.encode(encoding))
            else:
                file.write(" %s=\"%s\"" % (k.encode(encoding),
                                           _escape_attrib(v, encoding)))
        for k, v in xmlns_items:
            file.write(" %s=\"%s\"" % (k.encode(encoding),
                                       _escape_attrib(v, encoding)))
        file.write(">")
        if node.text or len(node):
            if node.text:
                file.write(_escape_cdata(node.text, encoding))
            for child in node:
                _write(file, child, encoding, namespaces, drop_attribs)
            file.write("</" + encoded_tag + ">")
        elif tag.lower() not in empty_elements:
            # Test the namespace-stripped tag so that XHTML-namespaced
            # empty elements are recognised as well.
            file.write("</" + encoded_tag + ">")
        # Declarations made on this element go out of scope with it.
        for _, uri in xmlns_items:
            del namespaces[uri]
    if node.tail:
        file.write(_escape_cdata(node.tail, encoding))
0080
0081def _escape_attrib(text, encoding=None, replace=string.replace):
0082    # escape attribute value
0083    try:
0084        if encoding:
0085            try:
0086                text = text.encode(encoding)
0087            except UnicodeError:
0088                return _encode_entity(text)
0089        text = replace(text, "&", "&amp;")
0090        text = replace(text, "'", "&apos;") # FIXME: overkill
0091        text = replace(text, "\"", "&quot;")
0092        text = replace(text, "<", "&lt;")
0093        text = replace(text, ">", "&gt;")
0094        return text
0095    except (TypeError, AttributeError):
0096        _raise_serialization_error(text)
0097
0098def _escape_cdata(text, encoding=None, replace=string.replace):
0099    # escape character data
0100    try:
0101        if encoding:
0102            try:
0103                text = text.encode(encoding)
0104            except UnicodeError:
0105                return _encode_entity(text)
0106        text = replace(text, "&", "&amp;")
0107        text = replace(text, "<", "&lt;")
0108        text = replace(text, ">", "&gt;")
0109        return text
0110    except (TypeError, AttributeError):
0111        _raise_serialization_error(text)
0112
def fixtag(tag, namespaces):
    """Turn a decorated ``{uri}name`` tag into ``prefix:name``.

    Parameters:
        tag: string of the form ``{uri}name``, or an ``et.QName`` whose
            ``text`` has that form.
        namespaces: dict mapping namespace URI to prefix.  A URI that is
            not in it yet is added, using the prefix from
            ``_namespace_map`` if there is one and ``ns<N>`` otherwise
            (N being the current size of ``namespaces``).

    Returns:
        A ``(prefixed_tag, xmlns)`` tuple.  ``xmlns`` is an
        ``("xmlns:prefix", uri)`` pair when the namespace was newly
        added, and None when it was already known or its prefix is
        ``xml``.

    NOTE(review): ``_namespace_map`` is not defined in the part of the
    module reviewed here -- confirm it exists.
    """
    if isinstance(tag, et.QName):
        tag = tag.text
    namespace_uri, local_name = tag[1:].split("}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # Already declared in an enclosing scope: nothing new to emit.
        return "%s:%s" % (prefix, local_name), None
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    if prefix == "xml":
        xmlns = None
    else:
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local_name), xmlns
0132
def tostring(node, encoding='utf-8', drop_attribs=None):
    """Serialize ``node`` and its subtree and return the markup.

    Parameters:
        node: lxml node handed to ``_write``.
        encoding: codec name used for the output (default ``utf-8``).
        drop_attribs: container of lower-case attribute names to leave
            out of the output; None (the default) keeps every attribute.

    Returns:
        The text accumulated by ``_write``, with ``&apos;`` turned back
        into ``'``.
    """
    out = StringIO()
    _write(out, node, encoding, {}, drop_attribs)
    # _escape_attrib escapes "'" as "&apos;", but _write always puts
    # attribute values in double quotes, so the apostrophe can be
    # written literally.
    return out.getvalue().replace('&apos;', "'")
0139
# Runs of markup-significant characters and of characters in the
# \u0080-\uffff range.
_escape = re.compile(u"[&<>\"\u0080-\uffff]+")

def _encode_entity(text, pattern=_escape):
    """Return ``text`` as ASCII with every ``pattern`` match entity-encoded.

    Each matched character is replaced by its ``_escape_map`` entry when
    it has one and by a decimal ``&#N;`` reference otherwise.  A
    TypeError raised while substituting or encoding is handed to
    ``_raise_serialization_error``.
    """
    def escape_entities(match, entities=_escape_map):
        def entity_for(char):
            named = entities.get(char)
            if named is not None:
                return named
            return "&#%d;" % ord(char)
        return "".join([entity_for(char) for char in match.group()])
    try:
        return pattern.sub(escape_entities, text).encode("ascii")
    except TypeError:
        _raise_serialization_error(text)