0001"""
0002utilities for manipulating html links
0003"""
0004
0005
0006from htmlserialize import decodeAndParseHTML, tostring
0007import urlparse
0008import re
0009
def fixup_text_links(doc, link_repl_func, remove_base_tags=True):
    """
    Text-in / text-out variant of fixup_links().

    `doc` is handed to decodeAndParseHTML(), the resulting tree has its
    links rewritten in place by fixup_links() (with `link_repl_func` and
    `remove_base_tags` passed straight through), and the output of
    tostring() on that tree is returned.
    """
    tree = decodeAndParseHTML(doc)
    fixup_links(tree, link_repl_func, remove_base_tags=remove_base_tags)
    serialized = tostring(tree)
    return serialized
0017
def fixup_links(doc, link_repl_func,
                remove_base_tags=True):
    """
    Rewrite the links of an already-parsed (lxml) document in place.

    doc              -- the parsed document; it is modified, nothing is
                        returned.
    link_repl_func   -- called with each link; whatever it returns is
                        stored in place of the original value.
    remove_base_tags -- when true, resolve_base_tags_in_document() is
                        run on the document before any link is rewritten.

    The `href` attributes are processed first, then the `src`
    attributes, and finally fixup_css_links() is applied with the same
    replacement function.
    """
    if remove_base_tags:
        resolve_base_tags_in_document(doc)

    for attrib in ('href', 'src'):
        # Select every element, anywhere in the tree, carrying this attribute.
        query = '//*[@%s]' % attrib
        for el in doc.xpath(query):
            old_link = el.attrib[attrib]
            el.attrib[attrib] = link_repl_func(old_link)

    fixup_css_links(doc, link_repl_func)
0034
def resolve_base_tags_in_document(doc):
    """
    Remove every <base href="..."> tag from the given document and
    resolve the document's links against the base URL they declared.

    When several <base href> tags are present, the first one in
    document order supplies the base URL (per the HTML specification,
    all but the first are ignored); every one of them is still removed.
    If the document has no <base href> tag, it is left untouched.

    The document is modified in place; nothing is returned.
    """
    basetags = doc.xpath('//base[@href]')
    if not basetags:
        return

    # Only the first <base href> is authoritative; later ones are ignored.
    base_href = basetags[0].attrib['href']

    for b in basetags:
        parent = b.getparent()
        # A <base> without a parent cannot be detached from anything.
        if parent is not None:
            parent.remove(b)

    def link_repl(href):
        return urlparse.urljoin(base_href, href)

    # The base tags are already gone, so do not look for them again.
    fixup_links(doc, link_repl, remove_base_tags=False)
0052
# Matches a CSS url(...) token (case-insensitive); group 1 is everything
# between the parentheses, including any quotes or padding whitespace.
CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)


def fixup_css_links(doc, link_repl_func):
    """
    Rewrite every url(...) reference found in the text of the
    document's <style> elements located directly under <head>.

    doc            -- parsed document; the text of the matching style
                      elements is replaced in place.
    link_repl_func -- called with each bare URL; its return value is
                      written back inside url(...).

    Whitespace and a matching pair of single or double quotes around
    the URL are stripped before link_repl_func is called, so that it
    receives the bare URL; the quotes are put back around the result.
    Style elements without text are skipped.
    """
    def absuri(matchobj):
        target = matchobj.group(1).strip()
        quote = ''
        # url("x") and url('x') are both valid CSS; the quotes are not
        # part of the link and must not reach link_repl_func.
        if len(target) >= 2 and target[0] == target[-1] and target[0] in '"\'':
            quote = target[0]
            target = target[1:-1]
        return 'url(%s%s%s)' % (quote, link_repl_func(target), quote)

    for el in doc.xpath('//head/style'):
        if el.text:
            el.text = CSS_URL_PAT.sub(absuri, el.text)