Ensure html5lib always has valid internal linkage

html5lib, with malformed HTML, can end up with detached linkage internally. Improve the current code to ensure html5lib always has proper linkage.
author: Isaac Muse <isaacmuse@gmail.com> 2018-12-25 15:25:24 -0700
committer: Isaac Muse <isaacmuse@gmail.com> 2018-12-25 15:25:24 -0700
commit: 20a03839bdfa379ad2113f5388bb489ffa72c4d8 (patch)
tree: d926cf3e18ef749c378296c0dbc96ec79286c3ad
parent: 3c3b6467d8c96042d1292c14a9b6d4518a298b6b (diff)
2 files changed, 266 insertions, 44 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ea9d9eb..e29d28e 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -442,53 +442,108 @@ class BeautifulSoup(Tag):
         self._most_recent_element = o
         parent.contents.append(o)
 
-        if parent.next_sibling is not None:
-            # This node is being inserted into an element that has
-            # already been parsed. Deal with any dangling references.
-            index = len(parent.contents)-1
-            while index >= 0:
-                if parent.contents[index] is o:
-                    break
-                index -= 1
+        # Check if we are inserting into an already parsed node.
+        if parent.next_element is not None:
+
+            # Check that links are proper across tag parent boundaries
+            child = self._linkage_fixer(parent)
+
+    def _linkage_fixer(self, el, _recursive_call=False):
+        """Make sure linkage of this fragment is sound."""
+        descendant = None
+
+        # If element is document element,
+        # it should have no previous element, previous sibling, or next sibling.
+        if el.parent is None:
+            if el.previous_element is not None:
+                el.previous_element = None
+            if el.previous_sibling is not None:
+                el.previous_element = None
+            if el.next_sibling is not None:
+                el.next_sibling = None
+
+        idx = 0
+        child = None
+        last_child = None
+        last_idx = len(el.contents) - 1
+        for child in el.contents:
+            descendant = None
+
+            # Parent should link next element to their first child
+            # That child should have no previous sibling
+            if idx == 0:
+                if el.parent is not None:
+                    if el.next_element is not child:
+                        el.next_element = child
+
+                    if child.previous_element is not el:
+                        child.previous_element = el
+
+                    if child.previous_sibling is not None:
+                        child.previous_sibling = None
+
+            # If not the first child, previous index should link as sibling to last index.
+            # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
             else:
-                raise ValueError(
-                    "Error building tree: supposedly %r was inserted "
-                    "into %r after the fact, but I don't see it!" % (
-                        o, parent
-                    )
-                )
-            if index == 0:
-                previous_element = parent
-                previous_sibling = None
+                if child.previous_sibling is not el.contents[idx - 1]:
+                    child.previous_sibling = el.contents[idx - 1]
+                if el.contents[idx - 1].next_sibling is not child:
+                    el.contents[idx - 1].next_sibling = child
+
+                if last_child is not None:
+                    if child.previous_element is not last_child:
+                        child.previous_element = last_child
+                    if last_child.next_element is not child:
+                        last_child.next_element = child
+
+            # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
+            if isinstance(child, Tag) and child.contents:
+                descendant = self._linkage_fixer(child, True)
+                # A bubbled up descendant should have no next siblings
+                # as it is last in its content list.
+                if descendant.next_sibling is not None:
+                    descendant.next_sibling = None
+
+            # Mark last child as either the bubbled up descendant or the current child
+            if descendant is not None:
+                last_child = descendant
             else:
-                previous_element = previous_sibling = parent.contents[index-1]
-                previous = previous_element
-                while isinstance(previous, Tag):
-                    if previous.contents:
-                        previous.next_element = previous.contents[0]
-                        previous = previous.contents[-1]
-                    else:
-                        break
-                previous_element = previous
+                last_child = child
+
+            # If last child in list, there are no next siblings
+            if idx == last_idx:
+                if child.next_sibling is not None:
+                    child.next_sibling = None
+            idx += 1
+
+        # The child to return is either the last descendant (if available)
+        # or the last processed child (if any). If neither is available,
+        # the parent element is its own last descendant.
+        child = descendant if descendant is not None else child
+        if child is None:
+            child = el
+
+        # If not a recursive call, we are done processing this element.
+        # As the final step, link last descendant. It should be linked
+        # to the parent's next sibling (if found), else walk up the chain
+        # and find a parent with a sibling.
+        if not _recursive_call and child is not None:
+            child.next_element = None
+            target = el
+            while True:
+                if target is None:
+                    break
+                elif target.next_sibling is not None:
+                    child.next_element = target.next_sibling
+                    target.next_sibling.previous_element = child
+                    break
+                target = target.parent
 
-            if index == len(parent.contents)-1:
-                next_element = parent.next_sibling
-                next_sibling = None
-            else:
-                next_element = next_sibling = parent.contents[index+1]
-
-            o.previous_element = previous_element
-            if previous_element is not None:
-                previous_element.next_element = o
-            o.next_element = next_element
-            if next_element is not None:
-                next_element.previous_element = o
-            o.next_sibling = next_sibling
-            if next_sibling is not None:
-                next_sibling.previous_sibling = o
-            o.previous_sibling = previous_sibling
-            if previous_sibling is not None:
-                previous_sibling.next_sibling = o
+            # We are done, so nothing to return
+            return None
+        else:
+            # Return the child to the recursive caller
+            return child
 
     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
diff --git a/bs4/testing.py b/bs4/testing.py
index 745a9c4..18b985c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -17,11 +17,48 @@ from bs4.element import (
     ContentMetaAttributeValue,
     Doctype,
     SoupStrainer,
+    Tag
 )
 
 from bs4.builder import HTMLParserTreeBuilder
 default_builder = HTMLParserTreeBuilder
 
+BAD_DOCUMENT = u"""A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b>    </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+"""
+
 
 class SoupTest(unittest.TestCase):
 
@@ -60,6 +97,123 @@ class SoupTest(unittest.TestCase):
                 self.assertEqual(earlier, e.previous_element)
             earlier = e
 
+    def linkage_validator(self, el, _recursive_call=False):
+        """Ensure proper linkage throughout the document."""
+        descendant = None
+        # Document element should have no previous element or previous sibling.
+        # It also shouldn't have a next sibling.
+        if el.parent is None:
+            assert el.previous_element is None,\
+                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                    el, el.previous_element, None
+                )
+            assert el.previous_sibling is None,\
+                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                    el, el.previous_sibling, None
+                )
+            assert el.next_sibling is None,\
+                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+                    el, el.next_sibling, None
+                )
+
+        idx = 0
+        child = None
+        last_child = None
+        last_idx = len(el.contents) - 1
+        for child in el.contents:
+            descendant = None
+
+            # Parent should link next element to their first child
+            # That child should have no previous sibling
+            if idx == 0:
+                if el.parent is not None:
+                    assert el.next_element is child,\
+                       "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+                            el, el.next_element, child
+                        )
+                    assert child.previous_element is el,\
+                       "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+                            child, child.previous_element, el
+                        )
+                    assert child.previous_sibling is None,\
+                       "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+                            child, child.previous_sibling, None
+                        )
+
+            # If not the first child, previous index should link as sibling to this index
+            # Previous element should match the last index or the last bubbled up descendant
+            else:
+                assert child.previous_sibling is el.contents[idx - 1],\
+                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+                        child, child.previous_sibling, el.contents[idx - 1]
+                    )
+                assert el.contents[idx - 1].next_sibling is child,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+                    )
+
+                if last_child is not None:
+                    assert child.previous_element is last_child,\
+                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+                            child, child.previous_element, last_child, child.parent.contents
+                        )
+                    assert last_child.next_element is child,\
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            last_child, last_child.next_element, child
+                        )
+
+            if isinstance(child, Tag) and child.contents:
+                descendant = self.linkage_validator(child, True)
+                # A bubbled up descendant should have no next siblings
+                assert descendant.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        descendant, descendant.next_sibling, None
+                    )
+
+            # Mark last child as either the bubbled up descendant or the current child
+            if descendant is not None:
+                last_child = descendant
+            else:
+                last_child = child
+
+            # If last child, there are non next siblings
+            if idx == last_idx:
+                assert child.next_sibling is None,\
+                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                        child, child.next_sibling, None
+                    )
+            idx += 1
+
+        child = descendant if descendant is not None else child
+        if child is None:
+            child = el
+
+        if not _recursive_call and child is not None:
+            target = el
+            while True:
+                if target is None:
+                    assert child.next_element is None, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, None
+                        )
+                    break
+                elif target.next_sibling is not None:
+                    assert child.next_element is target.next_sibling, \
+                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+                            child, child.next_element, target.next_sibling
+                        )
+                    break
+                target = target.parent
+
+            # We are done, so nothing to return
+            return None
+        else:
+            # Return the child to the recursive caller
+            return child
+
+        return descendant if descendant is not None else child
+
+
 class HTMLTreeBuilderSmokeTest(object):
 
     """A basic test of a treebuilder's competence.
@@ -615,6 +769,13 @@ Hello, world!
         data.a['foo'] = 'bar'
         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
 
+    def test_worst_case(self):
+        """Test the worst case (currently) for linking issues."""
+
+        soup = self.soup(BAD_DOCUMENT)
+        self.linkage_validator(soup)
+
+
 class XMLTreeBuilderSmokeTest(object):
 
     def test_pickle_and_unpickle_identity(self):
@@ -761,6 +922,12 @@ class XMLTreeBuilderSmokeTest(object):
         # The two tags have the same namespace prefix.
         self.assertEqual(tag.prefix, duplicate.prefix)
 
+    def test_worst_case(self):
+        """Test the worst case (currently) for linking issues."""
+
+        soup = self.soup(BAD_DOCUMENT)
+        self.linkage_validator(soup)
+
 
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
     """Smoke test for a tree builder that supports HTML5."""
author	Isaac Muse <isaacmuse@gmail.com>	2018-12-25 15:25:24 -0700
committer	Isaac Muse <isaacmuse@gmail.com>	2018-12-25 15:25:24 -0700
commit	20a03839bdfa379ad2113f5388bb489ffa72c4d8 (patch)
tree	d926cf3e18ef749c378296c0dbc96ec79286c3ad
parent	3c3b6467d8c96042d1292c14a9b6d4518a298b6b (diff)