diff options
-rw-r--r-- | bs4/__init__.py | 2 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 34 | ||||
-rw-r--r-- | bs4/element.py | 7 |
3 files changed, 36 insertions, 7 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 7007796..2dd0521 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -281,7 +281,7 @@ class BeautifulSoup(Tag):
         don't call handle_endtag.
         """
 
-        #print "Start tag %s: %s" % (name, attrs)
+        # print "Start tag %s: %s" % (name, attrs)
         self.endData()
 
         if (self.parse_only and len(self.tagStack) <= 1
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index ad566e6..7fccb8e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             parser = parser(target=self, strip_cdata=False)
         self.parser = parser
         self.soup = None
+        self.nsmaps = []
+
+    def _getNsTag(self, tag):
+        # Split the namespace URL out of a fully-qualified lxml tag
+        # name. Copied from lxml's src/lxml/sax.py.
+        if tag[0] == '{':
+            return tuple(tag[1:].split('}', 1))
+        else:
+            return (None, tag)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
@@ -63,16 +72,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.parser.close()
 
     def close(self):
-        pass
-
-    def start(self, name, attrs):
-        # XXX namespace
-        self.soup.handle_starttag(name, None, None, attrs)
+        self.namespaces.clear()
+
+    def start(self, name, attrs, nsmap={}):
+        nsprefix = None
+        # Invert each namespace map as it comes in.
+        if len(nsmap) == 0:
+            self.nsmaps.append(None)
+        else:
+            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            self.nsmaps.append(inverted_nsmap)
+        if "{" in name:
+            import pdb; pdb.set_trace()
+        namespace, name = self._getNsTag(name)
+        if namespace is not None:
+            for inverted_nsmap in reversed(self.nsmaps):
+                if inverted_nsmap is not None and namespace in inverted_nsmap:
+                    nsprefix = inverted_nsmap[namespace]
+                    break
+        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
 
     def end(self, name):
         self.soup.endData()
         completed_tag = self.soup.tagStack[-1]
         self.soup.handle_endtag(name)
+        self.nsmaps.pop()
 
     def pi(self, target, data):
         pass
diff --git a/bs4/element.py b/bs4/element.py
index 73f225e..ab30951 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -805,6 +805,10 @@ class Tag(PageElement):
         else:
             closeTag = '</%s>' % self.name
 
+        prefix = ''
+        if self.nsprefix:
+            prefix = self.nsprefix + ":"
+
         pretty_print = (indent_level is not None)
         if pretty_print:
             space = (' ' * (indent_level - 1))
@@ -825,7 +829,8 @@ class Tag(PageElement):
             attribute_string = ' ' + ' '.join(attrs)
         if pretty_print:
             s.append(space)
-        s.append('<%s%s%s>' % (self.name, attribute_string, close))
+        s.append('<%s%s%s%s>' % (
+                prefix, self.name, attribute_string, close))
         if pretty_print:
             s.append("\n")
         s.append(contents)