diff options
-rw-r--r-- | bs4/__init__.py | 8 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 16 | ||||
-rw-r--r-- | bs4/testing.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 14 |
4 files changed, 47 insertions, 3 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 80f6f68..fe2656b 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -245,13 +245,15 @@ class BeautifulSoup(Tag): o = containerClass(currentData) self.object_was_parsed(o) - def object_was_parsed(self, o): + def object_was_parsed(self, o, parent=None, previous_element=None): """Add an object to the parse tree.""" - o.setup(self.currentTag, self.previous_element) + parent = parent or self.currentTag + previous_element = previous_element or self.previous_element + o.setup(parent, previous_element) if self.previous_element: self.previous_element.next_element = o self.previous_element = o - self.currentTag.contents.append(o) + parent.contents.append(o) def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 6001e38..29650a6 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -72,6 +72,22 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): doctype = Doctype.for_name_and_ids(name, publicId, systemId) self.soup.object_was_parsed(doctype) + def insertComment(self, token, parent=None): + comment = Comment(token['data']) + parent = parent or self.soup + # We can't rely on self.soup.previousElement, because this + # comment may have been parsed a long time ago and inserted. + if parent is None: + parent = self.soup.currentTag + else: + parent = parent.element + if len(parent.contents) > 0: + previous_element = parent.contents[-1] + else: + previous_element = parent + + self.soup.object_was_parsed(comment, parent, previous_element) + def elementClass(self, name, namespace): tag = self.soup.new_tag(name, namespace) return Element(tag, self.soup, namespace) diff --git a/bs4/testing.py b/bs4/testing.py index 30e74f4..0f052eb 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -159,6 +159,12 @@ class HTMLTreeBuilderSmokeTest(object): comment = soup.find(text="foobar") self.assertEqual(comment.__class__, Comment) + # The comment is properly integrated into the tree. + foo = soup.find(text="foo") + self.assertEqual(comment, foo.next_element) + baz = soup.find(text="baz") + self.assertEquals(comment, baz.previous_element) + def test_preserved_whitespace_in_pre_and_textarea(self): """Whitespace must be preserved in <pre> and <textarea> tags.""" self.assertSoupEquals("<pre> </pre>") @@ -523,6 +529,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): self.assertEqual(namespace, soup.math.namespace) self.assertEqual(namespace, soup.msqrt.namespace) + def test_xml_declaration_becomes_comment(self): + markup = '<?xml version="1.0" encoding="utf-8"?><html></html>' + soup = self.soup(markup) + self.assertTrue(isinstance(soup.contents[0], Comment)) + self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') + self.assertEqual("html", soup.contents[0].next_element.name) def skipIf(condition, reason): def nothing(test, *args, **kwargs): diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index f195f7d..0e1c1d8 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): "<table><thead><tr><td>Foo</td></tr></thead>" "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + + def test_xml_declaration_followed_by_doctype(self): + markup = '''<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html> +<html> + <head> + </head> + <body> + <p>foo</p> + </body> +</html>''' + soup = self.soup(markup) + # Verify that we can reach the <p> tag; this means the tree is connected. + self.assertEquals("<p>foo</p>", soup.p.encode()) |