0001import string
0002import re
0003from lxml import etree as et
0004from cStringIO import StringIO
0005
# Clark-notation prefix ("{uri}") that marks a tag as belonging to XHTML.
XHTML_ns = '{http://www.w3.org/1999/xhtml}'

# Elements written without a closing tag when they have no text or children.
empty_elements = set((
    'area', 'base', 'basefont', 'br', 'col', 'frame',
    'hr', 'img', 'input', 'isindex', 'link', 'meta',
    'param',
))

# NOTE(review): not referenced by the serializer code visible in this
# module; presumably elements whose text should be emitted unescaped.
noescape_elements = set((
    'script', 'style',
))

# Attributes written in minimized form (name only, no ="value").
boolean_attributes = set((
    'selected', 'checked', 'compact', 'declare',
    'defer', 'disabled', 'ismap', 'multiple', 'nohref',
    'noresize', 'noshade', 'nowrap',
))
0018
def _write(file, node, encoding, namespaces, drop_attribs):
    """Serialize ``node`` and its subtree to ``file`` as HTML-style markup.

    Comments are written as ``<!-- text -->`` and processing instructions
    as ``<?target text?>``.  For elements, the XHTML namespace prefix is
    stripped from the tag, other namespaced names are rewritten to
    ``prefix:name`` via ``fixtag``, attributes are written in sorted order,
    and attributes listed in ``boolean_attributes`` are minimized (name
    only).  An element without text or children that is listed in
    ``empty_elements`` gets no closing tag.  The node's tail text is
    written after the node itself.

    Args:
        file: object with a ``write`` method receiving the encoded output.
        node: lxml node (element, comment or processing instruction).
        encoding: name of the encoding used for tags, attributes and text.
        namespaces: dict mapping namespace URI to prefix.  Entries declared
            by this element are removed again before returning.
        drop_attribs: container of lower-case attribute names to omit, or
            ``None`` to keep every attribute.
    """
    tag = node.tag
    if isinstance(node, et._Comment):
        file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif isinstance(node, et._ProcessingInstruction):
        # lxml keeps the PI target separate from its text, so join them to
        # rebuild the "<?target data?>" form.
        content = node.target
        if node.text:
            content = "%s %s" % (content, node.text)
        file.write("<?%s?>" % _escape_cdata(content, encoding))
    else:
        if tag.startswith(XHTML_ns):
            tag = tag[len(XHTML_ns):]
        # Namespace declarations ("xmlns:prefix", uri) introduced by this
        # element; they are emitted as attributes and undone afterwards.
        xmlns_items = []
        try:
            if isinstance(tag, et.QName) or tag[:1] == "{":
                tag, xmlns = fixtag(tag, namespaces)
                if xmlns:
                    xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)
        file.write("<" + tag.encode(encoding))
        for k, v in sorted(node.items()):
            try:
                if isinstance(k, et.QName) or k[:1] == "{":
                    k, xmlns = fixtag(k, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(k)
            try:
                if isinstance(v, et.QName):
                    v, xmlns = fixtag(v, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(v)
            if drop_attribs is not None and k.lower() in drop_attribs:
                continue
            if k.lower() in boolean_attributes:
                file.write(' %s' % k.encode(encoding))
            else:
                file.write(" %s=\"%s\"" % (k.encode(encoding),
                                           _escape_attrib(v, encoding)))
        for k, v in xmlns_items:
            file.write(" %s=\"%s\"" % (k.encode(encoding),
                                       _escape_attrib(v, encoding)))
        # NOTE(review): noescape_elements is not consulted here, so the text
        # of script/style elements is escaped like any other text -- confirm
        # that this is intended.
        if node.text or len(node):
            file.write(">")
            if node.text:
                file.write(_escape_cdata(node.text, encoding))
            for n in node:
                _write(file, n, encoding, namespaces, drop_attribs)
            file.write("</" + tag.encode(encoding) + ">")
        else:
            file.write(">")
            # Compare the namespace-stripped tag so that XHTML-namespaced
            # void elements (e.g. "{...xhtml}br") are recognised as well.
            if tag.lower() not in empty_elements:
                file.write("</" + tag.encode(encoding) + ">")
        # The second item of each pair is the namespace URI, which is the
        # key under which fixtag registered the prefix.
        for k, v in xmlns_items:
            del namespaces[v]
    if node.tail:
        file.write(_escape_cdata(node.tail, encoding))
0080
0081def _escape_attrib(text, encoding=None, replace=string.replace):
0082
0083 try:
0084 if encoding:
0085 try:
0086 text = text.encode(encoding)
0087 except UnicodeError:
0088 return _encode_entity(text)
0089 text = replace(text, "&", "&")
0090 text = replace(text, "'", "'")
0091 text = replace(text, "\"", """)
0092 text = replace(text, "<", "<")
0093 text = replace(text, ">", ">")
0094 return text
0095 except (TypeError, AttributeError):
0096 _raise_serialization_error(text)
0097
0098def _escape_cdata(text, encoding=None, replace=string.replace):
0099
0100 try:
0101 if encoding:
0102 try:
0103 text = text.encode(encoding)
0104 except UnicodeError:
0105 return _encode_entity(text)
0106 text = replace(text, "&", "&")
0107 text = replace(text, "<", "<")
0108 text = replace(text, ">", ">")
0109 return text
0110 except (TypeError, AttributeError):
0111 _raise_serialization_error(text)
0112
def fixtag(tag, namespaces):
    """Rewrite a ``{uri}local`` name (or QName) as ``prefix:local``.

    Args:
        tag: name in Clark notation (``{namespace-uri}local``) or an
            ``et.QName`` wrapping such a name.
        namespaces: dict mapping namespace URI to prefix.  A URI that is
            not yet present is added by this call.

    Returns:
        A ``(name, xmlns)`` tuple.  ``name`` is ``"prefix:local"``.
        ``xmlns`` is ``("xmlns:<prefix>", uri)`` when this call registered
        a new prefix other than ``"xml"``; otherwise it is ``None``.
    """
    if isinstance(tag, et.QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # NOTE(review): _namespace_map is not defined in the visible part
        # of this module; presumably a dict of well-known URI -> prefix
        # mappings -- confirm it exists at module level.
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            # The "xml" prefix never gets an xmlns declaration.
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        # Prefix already in scope: nothing new to declare.
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
0132
def tostring(node, encoding='utf-8', drop_attribs=()):
    """Serialize ``node`` and its subtree and return the result as a string.

    Args:
        node: root node to serialize.
        encoding: name of the encoding used for the output.
        drop_attribs: container of lower-case attribute names to leave out
            of the output.

    Returns:
        The serialized markup, with every ``&apos;`` reference turned back
        into a literal apostrophe.
    """
    out = StringIO()
    _write(out, node, encoding, {}, drop_attribs)
    return out.getvalue().replace('&apos;', "'")
0139
# Runs of characters that have to be written as references: the markup
# characters & < > " and everything from U+0080 through U+FFFF.
_escape = re.compile(u"[&<>\"\u0080-\uffff]+")


def _encode_entity(text, pattern=_escape):
    """Return ``text`` as ASCII with special characters written as references.

    Every character matched by ``pattern`` is replaced by its entry in
    ``_escape_map`` or, when the map has no entry for it, by a decimal
    numeric character reference (``&#NNN;``).  A ``TypeError`` raised
    during the substitution is reported through
    ``_raise_serialization_error``.
    """
    # Looked up once per call, before any substitution takes place.
    known_entities = _escape_map

    def to_references(match):
        pieces = []
        for ch in match.group():
            reference = known_entities.get(ch)
            if reference is None:
                reference = "&#%d;" % ord(ch)
            pieces.append(reference)
        return "".join(pieces)

    try:
        return pattern.sub(to_references, text).encode("ascii")
    except TypeError:
        _raise_serialization_error(text)