summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 12:23:12 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 12:23:12 -0500
commit fcefebe15290b9ff44934efa73fb07c70ebf5171 (patch)
tree c0b3ae8837a96975e1b88f3e2e9befc07a72e70c
parent b7749c50a2c96ccf6982cfa1ca02d883e31e0af9 (diff)
Fixed handling of the closing of namespaced tags.
-rw-r--r-- NEWS.txt 2
-rw-r--r-- bs4/__init__.py 9
-rw-r--r-- bs4/builder/_lxml.py 9
-rw-r--r-- bs4/element.py 7
-rw-r--r-- bs4/testing.py 11
-rw-r--r-- bs4/tests/test_soup.py 4
-rw-r--r-- bs4/tests/test_tree.py 17
7 files changed, 51 insertions, 8 deletions
diff --git a/NEWS.txt b/NEWS.txt
index fe2e0cc..c93541e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -8,6 +8,8 @@
to use namespace prefixes exactly as they're used in the original
document.
+* The string representation of a DOCTYPE always ends in a newline.
+
= 4.0.0b7 (20110223) =
* Upon decoding to string, any characters that can't be represented in
diff --git a/bs4/__init__.py b/bs4/__init__.py
index bf800ea..9b5c155 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -249,7 +249,7 @@ class BeautifulSoup(Tag):
self.previous_element = o
self.currentTag.contents.append(o)
- def _popToTag(self, name, inclusivePop=True):
+ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instqance of
@@ -262,7 +262,8 @@ class BeautifulSoup(Tag):
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
- if name == self.tagStack[i].name:
+ if (name == self.tagStack[i].name
+ and nsprefix == self.tagStack[i].nsprefix == nsprefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
@@ -299,10 +300,10 @@ class BeautifulSoup(Tag):
self.pushTag(tag)
return tag
- def handle_endtag(self, name):
+ def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
- self._popToTag(name)
+ self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 870d59e..e5e30d4 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -106,7 +106,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
- self.soup.handle_endtag(name)
+ namespace, name = self._getNsTag(name)
+ nsprefix = None
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
diff --git a/bs4/element.py b/bs4/element.py
index c2c4e2e..efc6ec7 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -25,7 +25,10 @@ def _alias(attr):
class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
- obj = unicode.__new__(cls, prefix + ":" + name)
+ if name is None:
+ obj = unicode.__new__(cls, prefix)
+ else:
+ obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
@@ -510,7 +513,7 @@ class Doctype(NavigableString):
return Doctype(value)
PREFIX = u'<!DOCTYPE '
- SUFFIX = u'>'
+ SUFFIX = u'>\n'
class Tag(PageElement):
diff --git a/bs4/testing.py b/bs4/testing.py
index 1945c02..6f9d857 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -371,6 +371,17 @@ class XMLTreeBuilderSmokeTest(object):
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1">\n<root/>')
+ def test_real_xhtml_document(self):
+ """A real XHTML document should come out the same as it went in."""
+ markup = b"""<?xml version="1.0" encoding="utf-8">
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+ soup = self.soup(markup)
+ self.assertEqual(soup.encode("utf-8"), markup)
+
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 8333ad4..33ab0fa 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -240,6 +240,10 @@ class TestUnicodeDammit(unittest.TestCase):
class TestNamedspacedAttribute(SoupTest):
+ def test_name_may_be_none(self):
+ a = NamespacedAttribute("xmlns", None)
+ self.assertEqual(a, "xmlns")
+
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index ce9a7ec..c75b561 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -18,7 +18,13 @@ from bs4.builder import (
builder_registry,
HTMLParserTreeBuilder,
)
-from bs4.element import CData, NavigableString, SoupStrainer, Tag
+from bs4.element import (
+ CData,
+ Doctype,
+ NavigableString,
+ SoupStrainer,
+ Tag,
+)
from bs4.testing import (
SoupTest,
skipIf,
@@ -1277,3 +1283,12 @@ class TestNavigableStringSubclasses(SoupTest):
self.assertEqual(str(soup), "<![CDATA[foo]]>")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
+
+ def test_doctype_ends_in_newline(self):
+ # Unlike other NavigableString subclasses, a DOCTYPE always ends
+ # in a newline.
+ doctype = Doctype("foo")
+ soup = self.soup("")
+ soup.insert(1, doctype)
+ self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
+