summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- bs4/__init__.py 8
-rw-r--r-- bs4/builder/_html5lib.py 16
-rw-r--r-- bs4/testing.py 12
-rw-r--r-- bs4/tests/test_html5lib.py 14
4 files changed, 47 insertions, 3 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 80f6f68..fe2656b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -245,13 +245,15 @@ class BeautifulSoup(Tag):
o = containerClass(currentData)
self.object_was_parsed(o)
-    def object_was_parsed(self, o):
+    def object_was_parsed(self, o, parent=None, previous_element=None):
         """Add an object to the parse tree."""
-        o.setup(self.currentTag, self.previous_element)
+        parent = parent or self.currentTag  # default: the tag currently open
+        self.previous_element = previous_element or self.previous_element  # adopt caller's anchor
+        o.setup(parent, self.previous_element)  # the lines below link this same anchor to o
         if self.previous_element:
             self.previous_element.next_element = o
         self.previous_element = o
-        self.currentTag.contents.append(o)
+        parent.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6001e38..29650a6 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -72,6 +72,22 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
+    def insertComment(self, token, parent=None):
+        """Add a Comment for token under parent (default: soup.currentTag)."""
+        comment = Comment(token['data'])
+        # We can't rely on self.soup.previousElement, because this
+        # comment may have been parsed a long time ago and inserted.
+        if parent is None:
+            parent = self.soup.currentTag
+        else:
+            parent = parent.element
+        if len(parent.contents) > 0:
+            previous_element = parent.contents[-1]
+        else:
+            previous_element = parent
+
+        self.soup.object_was_parsed(comment, parent, previous_element)
+
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
diff --git a/bs4/testing.py b/bs4/testing.py
index 30e74f4..0f052eb 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -159,6 +159,12 @@ class HTMLTreeBuilderSmokeTest(object):
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
+ # The comment is properly integrated into the tree.
+ foo = soup.find(text="foo")
+ self.assertEqual(comment, foo.next_element)
+ baz = soup.find(text="baz")
+ self.assertEquals(comment, baz.previous_element)
+
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
@@ -523,6 +529,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
+    def test_xml_declaration_becomes_comment(self):
+        # The XML declaration should come back as the document's first node, a Comment.
+        first = self.soup('<?xml version="1.0" encoding="utf-8"?><html></html>').contents[0]
+        self.assertTrue(isinstance(first, Comment))
+        self.assertEqual(first, '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", first.next_element.name)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index f195f7d..0e1c1d8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -56,3 +56,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_xml_declaration_followed_by_doctype(self):
+        markup = '''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html>
+ <head>
+ </head>
+ <body>
+ <p>foo</p>
+ </body>
+</html>'''
+        soup = self.soup(markup)
+        # Reaching the <p> tag through the soup means the tree is connected.
+        self.assertEqual("<p>foo</p>", soup.p.encode())