diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 12:23:12 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-23 12:23:12 -0500 |
commit | fcefebe15290b9ff44934efa73fb07c70ebf5171 (patch) | |
tree | c0b3ae8837a96975e1b88f3e2e9befc07a72e70c | |
parent | b7749c50a2c96ccf6982cfa1ca02d883e31e0af9 (diff) |
Fixed handling of the closing of namespaced tags.
-rw-r--r-- | NEWS.txt | 2 | ||||
-rw-r--r-- | bs4/__init__.py | 9 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 9 | ||||
-rw-r--r-- | bs4/element.py | 7 | ||||
-rw-r--r-- | bs4/testing.py | 11 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 4 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 17 |
7 files changed, 51 insertions, 8 deletions
@@ -8,6 +8,8 @@ to use namespace prefixes exactly as they're used in the original document. +* The string representation of a DOCTYPE always ends in a newline. + = 4.0.0b7 (20110223) = * Upon decoding to string, any characters that can't be represented in diff --git a/bs4/__init__.py b/bs4/__init__.py index bf800ea..9b5c155 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -249,7 +249,7 @@ class BeautifulSoup(Tag): self.previous_element = o self.currentTag.contents.append(o) - def _popToTag(self, name, inclusivePop=True): + def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of @@ -262,7 +262,8 @@ class BeautifulSoup(Tag): mostRecentTag = None for i in range(len(self.tagStack) - 1, 0, -1): - if name == self.tagStack[i].name: + if (name == self.tagStack[i].name + and nsprefix == self.tagStack[i].nsprefix == nsprefix): numPops = len(self.tagStack) - i break if not inclusivePop: @@ -299,10 +300,10 @@ class BeautifulSoup(Tag): self.pushTag(tag) return tag - def handle_endtag(self, name): + def handle_endtag(self, name, nsprefix=None): #print "End tag: " + name self.endData() - self._popToTag(name) + self._popToTag(name, nsprefix) def handle_data(self, data): self.currentData.append(data) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 870d59e..e5e30d4 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -106,7 +106,14 @@ class LXMLTreeBuilderForXML(TreeBuilder): def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] - self.soup.handle_endtag(name) + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) if self.nsmaps != None: # This tag, or one of its parents, introduced a namespace # mapping, so pop it off the stack. diff --git a/bs4/element.py b/bs4/element.py index c2c4e2e..efc6ec7 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -25,7 +25,10 @@ def _alias(attr): class NamespacedAttribute(unicode): def __new__(cls, prefix, name, namespace=None): - obj = unicode.__new__(cls, prefix + ":" + name) + if name is None: + obj = unicode.__new__(cls, prefix) + else: + obj = unicode.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace @@ -510,7 +513,7 @@ class Doctype(NavigableString): return Doctype(value) PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>' + SUFFIX = u'>\n' class Tag(PageElement): diff --git a/bs4/testing.py b/bs4/testing.py index 1945c02..6f9d857 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -371,6 +371,17 @@ class XMLTreeBuilderSmokeTest(object): soup.encode("latin1"), b'<?xml version="1.0" encoding="latin1">\n<root/>') + def test_real_xhtml_document(self): + """A real XHTML document should come out the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): self.assertSoupEquals("<p>", "<p/>") diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 8333ad4..33ab0fa 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -240,6 +240,10 @@ class TestUnicodeDammit(unittest.TestCase): class TestNamedspacedAttribute(SoupTest): + def test_name_may_be_none(self): + a = NamespacedAttribute("xmlns", None) + self.assertEqual(a, "xmlns") + def test_attribute_is_equivalent_to_colon_separated_string(self): a = NamespacedAttribute("a", "b") self.assertEqual("a:b", a) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index ce9a7ec..c75b561 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -18,7 +18,13 @@ from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) -from bs4.element import CData, NavigableString, SoupStrainer, Tag +from bs4.element import ( + CData, + Doctype, + NavigableString, + SoupStrainer, + Tag, +) from bs4.testing import ( SoupTest, skipIf, @@ -1277,3 +1283,12 @@ class TestNavigableStringSubclasses(SoupTest): self.assertEqual(str(soup), "<![CDATA[foo]]>") self.assertEqual(soup.find(text="foo"), "foo") self.assertEqual(soup.contents[0], "foo") + + def test_doctype_ends_in_newline(self): + # Unlike other NavigableString subclasses, a DOCTYPE always ends + # in a newline. + doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + |