summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/builder/_html5lib.py 6
-rw-r--r-- bs4/testing.py 10
3 files changed, 19 insertions, 0 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 45e4e4d..9e39967 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
= 4.3.3 (Unreleased) =
+* Fixed yet another problem that caused the html5lib tree builder to
+ create a disconnected parse tree. [bug=1237763]
+
* Restored the helpful syntax error that happens when you try to
import the Python 2 edition of Beautiful Soup under Python
3. [bug=1213387]
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 7de36ae..6446c2e 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -161,6 +161,12 @@ class Element(html5lib.treebuilders._base.Node):
# immediately after the parent, if it has no children.)
if self.element.contents:
most_recent_element = self.element._last_descendant(False)
+ elif self.element.next_element is not None:
+ # Something from further ahead in the parse tree is
+ # being inserted into this earlier element. This is
+ # very annoying because it means an expensive search
+ # for the last element in the tree.
+ most_recent_element = self.soup._last_descendant()
else:
most_recent_element = self.element
diff --git a/bs4/testing.py b/bs4/testing.py
index fd4495a..ce207cf 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -254,6 +254,16 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
+ def test_head_tag_between_head_and_body(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """<html><head></head>
+ <link></link>
+ <body>foo</body>
+</html>
+"""
+ soup = self.soup(content)
+ self.assertNotEqual(None, soup.html.body)
+
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose