0001from lxml import etree
0002import re
0003
# XSLT stylesheet source for plain HTML serialization: the single template
# matches the document root and copies the whole input through unchanged
# (xsl:copy-of select="."), while xsl:output requests the "html" output
# method with UTF-8 encoding.
html_xsl = """
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output method="html" encoding="UTF-8" />
 <xsl:template match="/">
 <xsl:copy-of select="."/>
 </xsl:template>
</xsl:transform>
"""
0012
0013
# XSLT stylesheet source for indented HTML serialization: the same
# copy-everything template as html_xsl, but xsl:output asks for
# indent="yes". Unlike html_xsl, no encoding attribute is given here.
pretty_html_xsl = """
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output method="html" indent="yes" />
 <xsl:template match="/">
 <xsl:copy-of select="."/>
 </xsl:template>
</xsl:transform>
"""
0022
# Parse and compile both stylesheets once, as module-level statements, so
# the resulting XSLT objects can be reused by every caller.
html_transform = etree.XSLT(etree.XML(html_xsl))
pretty_html_transform = etree.XSLT(etree.XML(pretty_html_xsl))
0025
0026
0027
def tostring(doc, pretty=False):
    """
    Serialize the given document to an HTML string.

    The document is passed through one of the module-level XSLT
    transforms -- ``pretty_html_transform`` when ``pretty`` is true,
    ``html_transform`` otherwise -- and the result is converted with
    ``str``.

    note: per the original author, this will create a
    meta http-equiv="Content" tag in the head and may replace any
    that are present
    """
    # Pick the transform first so there is a single serialization path.
    transform = pretty_html_transform if pretty else html_transform
    return str(transform(doc))
0040
0041
0042
0043
0044
0045
def decodeAndParseHTML(text):
    """
    Parse ``text`` as HTML with ``etree.HTML`` and return the result.

    The function was meant to decode the text to a unicode string first
    when an html meta tag specifying a charset could be matched. That
    step is disabled: no decoding happens here and ``text`` is handed to
    the parser unchanged.

    XXX - the original author notes this is disabled and in camelCase
    for no good reason; the name is left as-is so existing callers are
    not broken.

    Raises AssertionError if the parser returns None instead of a
    document.
    """
    content = etree.HTML(text)
    # Raise explicitly instead of using ``assert`` so the check is not
    # stripped when Python runs with -O. AssertionError is kept as the
    # exception type so callers that already catch it keep working.
    if content is None:
        raise AssertionError("etree.HTML returned None for the given text")
    return content