8 files changed, 112 insertions, 26 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 13dac85..2dd0521 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -193,9 +193,9 @@ class BeautifulSoup(Tag):
         self.tagStack = []
         self.pushTag(self)
 
-    def new_tag(self, name, **attrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
         """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, attrs)
+        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
 
     def new_string(self, s):
         """Create a new NavigableString associated with this soup."""
@@ -272,7 +272,7 @@ class BeautifulSoup(Tag):
             mostRecentTag = self.popTag()
         return mostRecentTag
 
-    def handle_starttag(self, name, attrs):
+    def handle_starttag(self, name, namespace, nsprefix, attrs):
         """Push a start tag on to the stack.
 
         If this method returns None, the tag was rejected by the
@@ -281,7 +281,7 @@ class BeautifulSoup(Tag):
         don't call handle_endtag.
         """
 
-        #print "Start tag %s: %s" % (name, attrs)
+        # print "Start tag %s: %s" % (name, attrs)
         self.endData()
 
         if (self.parse_only and len(self.tagStack) <= 1
@@ -289,8 +289,8 @@ class BeautifulSoup(Tag):
                  or not self.parse_only.search_tag(name, attrs))):
             return None
 
-        tag = Tag(self, self.builder, name, attrs, self.currentTag,
-                  self.previous_element)
+        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+                  self.currentTag, self.previous_element)
         if tag is None:
             return tag
         if self.previous_element:
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0d7a1a9..7ce69aa 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,6 +8,7 @@ from bs4.builder import (
     HTML_5,
     HTMLTreeBuilder,
     )
+from bs4.element import NamespacedAttribute
 import html5lib
 from html5lib.constants import (
     DataLossWarning,
@@ -58,9 +59,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
 
     def __init__(self, soup, namespaceHTMLElements):
         self.soup = soup
-        if namespaceHTMLElements:
-            warnings.warn("namespaceHTMLElements not supported yet",
-                          DataLossWarning)
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
     def documentClass(self):
@@ -76,9 +74,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         self.soup.object_was_parsed(doctype)
 
     def elementClass(self, name, namespace):
-        if namespace is not None:
-            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
-        tag = self.soup.new_tag(name)
+        tag = self.soup.new_tag(name, namespace)
         return Element(tag, self.soup, namespace)
 
     def commentClass(self, data):
@@ -144,6 +140,8 @@ class Element(html5lib.treebuilders._base.Node):
     def setAttributes(self, attributes):
         if attributes is not None and attributes != {}:
             for name, value in list(attributes.items()):
+                if isinstance(name, tuple):
+                    name = NamespacedAttribute(*name)
                 self.element[name] =  value
             # The attributes may contain variables that need substitution.
             # Call set_up_substitutions manually.
@@ -189,7 +187,7 @@ class Element(html5lib.treebuilders._base.Node):
                     TextNode(child, self.soup))
 
     def cloneNode(self):
-        tag = self.soup.new_tag(self.element.name)
+        tag = self.soup.new_tag(self.element.name, self.namespace)
         node = Element(tag, self.soup, self.namespace)
         for key,value in self.attributes:
             node.attributes[key] = value
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 62473cf..c307ff8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -40,7 +40,8 @@ HTMLPARSER = 'html.parser'
 
 class BeautifulSoupHTMLParser(HTMLParser):
     def handle_starttag(self, name, attrs):
-        self.soup.handle_starttag(name, dict(attrs))
+        # XXX namespace: none is passed on; namespace and nsprefix are None.
+        self.soup.handle_starttag(name, None, None, dict(attrs))
 
     def handle_endtag(self, name):
         self.soup.handle_endtag(name)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..870d59e 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,7 +5,7 @@ __all__ = [
 
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype
+from bs4.element import Comment, Doctype, NamespacedAttribute
 from bs4.builder import (
     FAST,
     HTML,
@@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             parser = parser(target=self, strip_cdata=False)
         self.parser = parser
         self.soup = None
+        self.nsmaps = None
+
+    def _getNsTag(self, tag):
+        """Split lxml's '{url}name' tag into (url, name); url may be None."""
+        # Based on the helper in lxml's src/lxml/sax.py. Slicing (rather
+        # than tag[0]) keeps an empty tag from raising IndexError.
+        if tag[:1] == '{':
+            return tuple(tag[1:].split('}', 1))
+        return (None, tag)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
@@ -63,15 +72,49 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         self.parser.close()
 
     def close(self):
-        pass
-
-    def start(self, name, attrs):
-        self.soup.handle_starttag(name, attrs)
+        self.nsmaps = None
+
+    def start(self, name, attrs, nsmap=None):
+        """Handle a start tag, tracking which namespace prefixes apply."""
+        nsprefix = None
+        if nsmap:
+            # A new namespace mapping has come into play; keep it inverted.
+            if self.nsmaps is None:
+                self.nsmaps = []
+            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            self.nsmaps.append(inverted_nsmap)
+            # Also treat the namespace mapping as a set of attributes on the
+            # tag, so we can recreate it later.
+            attrs = attrs.copy()
+            for prefix, namespace in nsmap.items():
+                attribute = NamespacedAttribute(
+                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+                attrs[attribute] = namespace
+        elif self.nsmaps is not None:
+            # There are no new namespaces for this tag, but namespaces
+            # are in play, so push a placeholder for end() to pop.
+            self.nsmaps.append(None)
+        namespace, name = self._getNsTag(name)
+        if namespace is not None:
+            # self.nsmaps is still None if no mapping was ever declared.
+            for inverted_nsmap in reversed(self.nsmaps or []):
+                if inverted_nsmap is not None and namespace in inverted_nsmap:
+                    nsprefix = inverted_nsmap[namespace]
+                    break
+        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
 
     def end(self, name):
         self.soup.endData()
         completed_tag = self.soup.tagStack[-1]
         self.soup.handle_endtag(name)
+        if self.nsmaps is not None:
+            # Namespaces are in play, so start() pushed an entry for this
+            # tag (a mapping or a None placeholder); pop it off the stack.
+            self.nsmaps.pop()
+            if not self.nsmaps:
+                # Namespaces are no longer in play, so don't bother keeping
+                # track of the namespace stack.
+                self.nsmaps = None
 
     def pi(self, target, data):
         pass
diff --git a/bs4/element.py b/bs4/element.py
index 997378a..fdb90e0 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -22,6 +22,20 @@ def _alias(attr):
     return alias
 
 
+class NamespacedAttribute(object):
+    """An attribute name qualified by a namespace prefix and URL."""
+
+    def __init__(self, namespace_abbreviation, name, namespace):
+        self.namespace_abbreviation = namespace_abbreviation
+        self.name = name
+        self.namespace = namespace
+
+    def __str__(self):
+        # Either part may be missing, e.g. name is None for a bare "xmlns".
+        parts = (self.namespace_abbreviation, self.name)
+        return ":".join(part for part in parts if part)
+
+
 class PageElement(object):
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -507,8 +521,8 @@ class Tag(PageElement):
 
     """Represents a found HTML tag with its attributes and contents."""
 
-    def __init__(self, parser=None, builder=None, name=None, attrs=None,
-                 parent=None, previous=None):
+    def __init__(self, parser=None, builder=None, name=None, namespace=None,
+                 nsprefix=None, attrs=None, parent=None, previous=None):
         "Basic constructor."
 
         if parser is None:
@@ -520,6 +534,8 @@ class Tag(PageElement):
         if name is None:
             raise ValueError("No value provided for new tag's name.")
         self.name = name
+        self.namespace = namespace
+        self.nsprefix = nsprefix
         if attrs is None:
             attrs = {}
         else:
@@ -779,7 +795,7 @@ class Tag(PageElement):
                         and '%SOUP-ENCODING%' in val):
                         val = self.substitute_encoding(val, eventual_encoding)
 
-                    decoded = (key + '='
+                    decoded = (str(key) + '='
                                + EntitySubstitution.substitute_xml(val, True))
                 attrs.append(decoded)
         close = ''
@@ -789,6 +805,10 @@ class Tag(PageElement):
         else:
             closeTag = '</%s>' % self.name
 
+        prefix = ''
+        if self.nsprefix:
+            prefix = self.nsprefix + ":"
+
         pretty_print = (indent_level is not None)
         if pretty_print:
             space = (' ' * (indent_level - 1))
@@ -809,7 +829,8 @@ class Tag(PageElement):
                 attribute_string = ' ' + ' '.join(attrs)
             if pretty_print:
                 s.append(space)
-            s.append('<%s%s%s>' % (self.name, attribute_string, close))
+            s.append('<%s%s%s%s>' % (
+                    prefix, self.name, attribute_string, close))
             if pretty_print:
                 s.append("\n")
             s.append(contents)
@@ -986,7 +1007,7 @@ class SoupStrainer(object):
     searchTag = search_tag
 
     def search(self, markup):
-        #print 'looking for %s in %s' % (self, markup)
+        # print 'looking for %s in %s' % (self, markup)
         found = None
         # If given a list of items, scan it for a text element that
         # matches.
@@ -1012,7 +1033,7 @@ class SoupStrainer(object):
         return found
 
     def _matches(self, markup, match_against):
-        #print "Matching %s against %s" % (markup, match_against)
+        # print "Matching %s against %s" % (markup, match_against)
         result = False
 
         if isinstance(markup, list) or isinstance(markup, tuple):
diff --git a/bs4/testing.py b/bs4/testing.py
index cc30e17..dc20812 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -198,6 +198,21 @@ class HTMLTreeBuilderSmokeTest(object):
         self.assertSoupEquals("&#x10000000000000;", expect)
         self.assertSoupEquals("&#1000000000;", expect)
 
+    def test_basic_namespaces(self):
+        """Parsers don't need to *understand* namespaces, but at the
+        very least they should not choke on namespaces or lose
+        data."""
+        # The document must round-trip and keep its xmlns attributes.
+        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode())
+        html = soup.html
+        self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
+        self.assertEqual(
+            'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
+        self.assertEqual(
+            'http://www.w3.org/2000/svg', html['xmlns:svg'])
+
     #
     # Generally speaking, tests below this point are more tests of
     # Beautiful Soup than tests of the tree builders. But parsers are
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index bcb5ed2..6215185 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -17,3 +17,4 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_namespaced_public_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
+
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index f39826a..6aa02cb 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -63,7 +63,6 @@ class TestFind(TreeTest):
         soup = self.soup(u'<h1>Räksmörgås</h1>')
         self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')
 
-
 class TestFindAll(TreeTest):
     """Basic tests of the find_all() method."""
 
@@ -94,6 +93,14 @@ class TestFindAll(TreeTest):
         self.assertSelects(
             soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
 
+class TestFindAllBasicNamespaces(TreeTest):
+    """Tests of find() on names that carry a namespace prefix."""
+    def test_find_by_namespaced_name(self):
+        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
+        self.assertEqual("4", soup.find("mathml:msqrt").string)
+        self.assertEqual("a", soup.find(attrs={"svg:fill": "red"}).name)
+
+
 class TestFindAllByName(TreeTest):
     """Test ways of finding tags by tag name."""