diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 13:55:51 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 13:55:51 -0500 |
commit | 97b54c4bdbee0f109c444b50d8102ae8d7abb7c4 (patch) | |
tree | 8feb3c4387fa5dc67c810f76c9a831ebf523898d /bs4/builder/_lxml.py | |
parent | 328204928bd22ca9e8aeac0a3208645d9f82f264 (diff) | |
parent | deaeb40977719ea821a62f41d75e2c9f48559094 (diff) |
The namespace stuff seems to work, and it's definitely an improvement on the status quo, so in it goes.
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- | bs4/builder/_lxml.py | 62 |
1 files changed, 56 insertions, 6 deletions
```diff
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..e5e30d4 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,7 +5,7 @@ __all__ = [
 
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype
+from bs4.element import Comment, Doctype, NamespacedAttribute
 from bs4.builder import (
     FAST,
     HTML,
@@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             parser = parser(target=self, strip_cdata=False)
         self.parser = parser
         self.soup = None
+        self.nsmaps = None
+
+    def _getNsTag(self, tag):
+        # Split the namespace URL out of a fully-qualified lxml tag
+        # name. Copied from lxml's src/lxml/sax.py.
+        if tag[0] == '{':
+            return tuple(tag[1:].split('}', 1))
+        else:
+            return (None, tag)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
@@ -63,15 +72,56 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.parser.close()
 
     def close(self):
-        pass
-
-    def start(self, name, attrs):
-        self.soup.handle_starttag(name, attrs)
+        self.nsmaps = None
+
+    def start(self, name, attrs, nsmap={}):
+        nsprefix = None
+        # Invert each namespace map as it comes in.
+        if len(nsmap) == 0 and self.nsmaps != None:
+            # There are no new namespaces for this tag, but namespaces
+            # are in play, so we need a separate tag stack to know
+            # when they end.
+            self.nsmaps.append(None)
+        elif len(nsmap) > 0:
+            # A new namespace mapping has come into play.
+            if self.nsmaps is None:
+                self.nsmaps = []
+            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            self.nsmaps.append(inverted_nsmap)
+            # Also treat the namespace mapping as a set of attributes on the
+            # tag, so we can recreate it later.
+            attrs = attrs.copy()
+            for prefix, namespace in nsmap.items():
+                attribute = NamespacedAttribute(
+                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+                attrs[attribute] = namespace
+        namespace, name = self._getNsTag(name)
+        if namespace is not None:
+            for inverted_nsmap in reversed(self.nsmaps):
+                if inverted_nsmap is not None and namespace in inverted_nsmap:
+                    nsprefix = inverted_nsmap[namespace]
+                    break
+        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
 
     def end(self, name):
         self.soup.endData()
         completed_tag = self.soup.tagStack[-1]
-        self.soup.handle_endtag(name)
+        namespace, name = self._getNsTag(name)
+        nsprefix = None
+        if namespace is not None:
+            for inverted_nsmap in reversed(self.nsmaps):
+                if inverted_nsmap is not None and namespace in inverted_nsmap:
+                    nsprefix = inverted_nsmap[namespace]
+                    break
+        self.soup.handle_endtag(name, nsprefix)
+        if self.nsmaps != None:
+            # This tag, or one of its parents, introduced a namespace
+            # mapping, so pop it off the stack.
+            self.nsmaps.pop()
+            if len(self.nsmaps) == 0:
+                # Namespaces are no longer in play, so don't bother keeping
+                # track of the namespace stack.
+                self.nsmaps = None
 
     def pi(self, target, data):
         pass
```