summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIsaac Muse <isaacmuse@gmail.com>2018-12-25 15:25:24 -0700
committerIsaac Muse <isaacmuse@gmail.com>2018-12-25 15:25:24 -0700
commit20a03839bdfa379ad2113f5388bb489ffa72c4d8 (patch)
treed926cf3e18ef749c378296c0dbc96ec79286c3ad
parent3c3b6467d8c96042d1292c14a9b6d4518a298b6b (diff)
Ensure html5lib always has valid internal linkage
html5lib, with malformed HTML, can end up with detached linkage internally. Improve the current code to ensure html5lib always has proper linkage.
-rw-r--r--bs4/__init__.py143
-rw-r--r--bs4/testing.py167
2 files changed, 266 insertions, 44 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ea9d9eb..e29d28e 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -442,53 +442,108 @@ class BeautifulSoup(Tag):
self._most_recent_element = o
parent.contents.append(o)
- if parent.next_sibling is not None:
- # This node is being inserted into an element that has
- # already been parsed. Deal with any dangling references.
- index = len(parent.contents)-1
- while index >= 0:
- if parent.contents[index] is o:
- break
- index -= 1
+ # Check if we are inserting into an already parsed node.
+ if parent.next_element is not None:
+
+ # Check that links are proper across tag parent boundaries
+ child = self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el, _recursive_call=False):
+ """Make sure linkage of this fragment is sound."""
+ descendant = None
+
+ # If element is document element,
+ # it should have no previous element, previous sibling, or next sibling.
+ if el.parent is None:
+ if el.previous_element is not None:
+ el.previous_element = None
+ if el.previous_sibling is not None:
+ el.previous_element = None
+ if el.next_sibling is not None:
+ el.next_sibling = None
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ if el.next_element is not child:
+ el.next_element = child
+
+ if child.previous_element is not el:
+ child.previous_element = el
+
+ if child.previous_sibling is not None:
+ child.previous_sibling = None
+
+ # If not the first child, previous index should link as sibling to last index.
+ # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
else:
- raise ValueError(
- "Error building tree: supposedly %r was inserted "
- "into %r after the fact, but I don't see it!" % (
- o, parent
- )
- )
- if index == 0:
- previous_element = parent
- previous_sibling = None
+ if child.previous_sibling is not el.contents[idx - 1]:
+ child.previous_sibling = el.contents[idx - 1]
+ if el.contents[idx - 1].next_sibling is not child:
+ el.contents[idx - 1].next_sibling = child
+
+ if last_child is not None:
+ if child.previous_element is not last_child:
+ child.previous_element = last_child
+ if last_child.next_element is not child:
+ last_child.next_element = child
+
+ # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
+ if isinstance(child, Tag) and child.contents:
+ descendant = self._linkage_fixer(child, True)
+ # A bubbled up descendant should have no next siblings
+ # as it is last in its content list.
+ if descendant.next_sibling is not None:
+ descendant.next_sibling = None
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
else:
- previous_element = previous_sibling = parent.contents[index-1]
- previous = previous_element
- while isinstance(previous, Tag):
- if previous.contents:
- previous.next_element = previous.contents[0]
- previous = previous.contents[-1]
- else:
- break
- previous_element = previous
+ last_child = child
+
+ # If last child in list, there are no next siblings
+ if idx == last_idx:
+ if child.next_sibling is not None:
+ child.next_sibling = None
+ idx += 1
+
+ # The child to return is either the last descendant (if available)
+ # or the last processed child (if any). If neither is available,
+ # the parent element is its own last descendant.
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ # If not a recursive call, we are done processing this element.
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling.
+ if not _recursive_call and child is not None:
+ child.next_element = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ child.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
- if index == len(parent.contents)-1:
- next_element = parent.next_sibling
- next_sibling = None
- else:
- next_element = next_sibling = parent.contents[index+1]
-
- o.previous_element = previous_element
- if previous_element is not None:
- previous_element.next_element = o
- o.next_element = next_element
- if next_element is not None:
- next_element.previous_element = o
- o.next_sibling = next_sibling
- if next_sibling is not None:
- next_sibling.previous_sibling = o
- o.previous_sibling = previous_sibling
- if previous_sibling is not None:
- previous_sibling.next_sibling = o
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
diff --git a/bs4/testing.py b/bs4/testing.py
index 745a9c4..18b985c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -17,11 +17,48 @@ from bs4.element import (
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
+ Tag
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
+BAD_DOCUMENT = u"""A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+"""
+
class SoupTest(unittest.TestCase):
@@ -60,6 +97,123 @@ class SoupTest(unittest.TestCase):
self.assertEqual(earlier, e.previous_element)
earlier = e
+ def linkage_validator(self, el, _recursive_call=False):
+ """Ensure proper linkage throughout the document."""
+ descendant = None
+ # Document element should have no previous element or previous sibling.
+ # It also shouldn't have a next sibling.
+ if el.parent is None:
+ assert el.previous_element is None,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_element, None
+ )
+ assert el.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_sibling, None
+ )
+ assert el.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_sibling, None
+ )
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ assert el.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_element, child
+ )
+ assert child.previous_element is el,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ child, child.previous_element, el
+ )
+ assert child.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+ child, child.previous_sibling, None
+ )
+
+ # If not the first child, previous index should link as sibling to this index
+ # Previous element should match the last index or the last bubbled up descendant
+ else:
+ assert child.previous_sibling is el.contents[idx - 1],\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+ child, child.previous_sibling, el.contents[idx - 1]
+ )
+ assert el.contents[idx - 1].next_sibling is child,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+ )
+
+ if last_child is not None:
+ assert child.previous_element is last_child,\
+ "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+ child, child.previous_element, last_child, child.parent.contents
+ )
+ assert last_child.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ last_child, last_child.next_element, child
+ )
+
+ if isinstance(child, Tag) and child.contents:
+ descendant = self.linkage_validator(child, True)
+ # A bubbled up descendant should have no next siblings
+ assert descendant.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ descendant, descendant.next_sibling, None
+ )
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
+ else:
+ last_child = child
+
+ # If last child, there are non next siblings
+ if idx == last_idx:
+ assert child.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_sibling, None
+ )
+ idx += 1
+
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ if not _recursive_call and child is not None:
+ target = el
+ while True:
+ if target is None:
+ assert child.next_element is None, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, None
+ )
+ break
+ elif target.next_sibling is not None:
+ assert child.next_element is target.next_sibling, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, target.next_sibling
+ )
+ break
+ target = target.parent
+
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
+
+ return descendant if descendant is not None else child
+
+
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
@@ -615,6 +769,13 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
+
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
@@ -761,6 +922,12 @@ class XMLTreeBuilderSmokeTest(object):
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""