From e1b321db7331752a3aea8dd7070dd0db4c60c51d Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 16 Feb 2012 16:33:40 -0500 Subject: It's a start, at least. --- bs4/builder/_lxml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index cc3cb86..985a030 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -66,7 +66,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): pass def start(self, name, attrs): - self.soup.handle_starttag(name, attrs) + # XXX namespace + self.soup.handle_starttag(name, None, attrs) def end(self, name): self.soup.endData() -- cgit v1.2.3 From 2ccae07967bb15f6bad6ba262411ac47bcbb98e7 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 21 Feb 2012 12:19:56 -0500 Subject: Added nsprefix argument to the tag class. --- bs4/builder/_lxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 985a030..ad566e6 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -67,7 +67,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def start(self, name, attrs): # XXX namespace - self.soup.handle_starttag(name, None, attrs) + self.soup.handle_starttag(name, None, None, attrs) def end(self, name): self.soup.endData() -- cgit v1.2.3 From 2b6af1e6204461e89338ae452c3bc742d0d1fa0f Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Tue, 21 Feb 2012 13:41:58 -0500 Subject: Have lxml invert namespace maps as they come in and set each tag's prefix appropriately. --- bs4/builder/_lxml.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index ad566e6..7fccb8e 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False) self.parser = parser self.soup = None + self.nsmaps = [] + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): @@ -63,16 +72,31 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.parser.close() def close(self): - pass - - def start(self, name, attrs): - # XXX namespace - self.soup.handle_starttag(name, None, None, attrs) + self.namespaces.clear() + + def start(self, name, attrs, nsmap={}): + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0: + self.nsmaps.append(None) + else: + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + if "{" in name: + import pdb; pdb.set_trace() + namespace, name = self._getNsTag(name) + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_starttag(name, namespace, nsprefix, attrs) def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] self.soup.handle_endtag(name) + self.nsmaps.pop() def pi(self, target, data): pass -- cgit v1.2.3 From d0868034b9156862d562ec2544842f4598a9ab76 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 22 Feb 2012 08:18:11 -0500 Subject: Treat a new namespace mapping as a set of attributes on the tag that defines it, so we don't lose the mappings. --- bs4/builder/_lxml.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 7fccb8e..5175f36 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -5,7 +5,7 @@ __all__ = [ import collections from lxml import etree -from bs4.element import Comment, Doctype +from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.builder import ( FAST, HTML, @@ -42,7 +42,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False) self.parser = parser self.soup = None - self.nsmaps = [] + self.nsmaps = None def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag @@ -77,13 +77,23 @@ class LXMLTreeBuilderForXML(TreeBuilder): def start(self, name, attrs, nsmap={}): nsprefix = None # Invert each namespace map as it comes in. - if len(nsmap) == 0: + if len(nsmap) == 0 and self.nsmaps != None: + # There are namespaces in play, so we need to keep track + # of when they start and end self.nsmaps.append(None) - else: + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + if self.nsmaps is None: + self.nsmaps = [] inverted_nsmap = dict((value, key) for key, value in nsmap.items()) self.nsmaps.append(inverted_nsmap) - if "{" in name: - import pdb; pdb.set_trace() + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace namespace, name = self._getNsTag(name) if namespace is not None: for inverted_nsmap in reversed(self.nsmaps): @@ -96,7 +106,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup.endData() completed_tag = self.soup.tagStack[-1] self.soup.handle_endtag(name) - self.nsmaps.pop() + if self.nsmaps != None: + self.nsmaps.pop() + if len(self.nsmaps) == 0: + self.nsmaps = None + def pi(self, target, data): pass -- cgit v1.2.3 From 6e4b4dfffbd6e8d465aebd009108654003da338b Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 22 Feb 2012 08:28:03 -0500 Subject: Added comments. --- bs4/builder/_lxml.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 5175f36..77660a4 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -78,8 +78,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): nsprefix = None # Invert each namespace map as it comes in. if len(nsmap) == 0 and self.nsmaps != None: - # There are namespaces in play, so we need to keep track - # of when they start and end + # There are no new namespaces for this tag, but namespaces + # are in play, so we need a separate tag stack to know + # when they end. self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. @@ -109,9 +110,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): if self.nsmaps != None: self.nsmaps.pop() if len(self.nsmaps) == 0: + # Namespaces are no longer in play, so don't bother keeping + # track of the namespace stack. self.nsmaps = None - def pi(self, target, data): pass -- cgit v1.2.3 From fcefebe15290b9ff44934efa73fb07c70ebf5171 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 23 Feb 2012 12:23:12 -0500 Subject: Fixed handling of the closing of namespaced tags. --- bs4/builder/_lxml.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'bs4/builder/_lxml.py') diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 870d59e..e5e30d4 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -106,7 +106,14 @@ class LXMLTreeBuilderForXML(TreeBuilder): def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] - self.soup.handle_endtag(name) + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) if self.nsmaps != None: # This tag, or one of its parents, introduced a namespace # mapping, so pop it off the stack. -- cgit v1.2.3