diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-05-30 15:03:49 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-05-30 15:03:49 -0400 |
commit | cd2a59fbd92fd78cde913b19bc14c2fda6122750 (patch) | |
tree | caeee46a372e7f1e23c6d9630407ea18e4aab012 | |
parent | b80c4201fc48617742c49411278b1853fb779a35 (diff) |
Fixed a bug that caused too many tags to be popped from the tag
stack during tree building, when encountering a closing tag that had
no matching opening tag. [bug=1880420]
-rw-r--r-- | CHANGELOG | 6 | ||||
-rw-r--r-- | bs4/__init__.py | 14 | ||||
-rw-r--r-- | bs4/testing.py | 24 |
3 files changed, 43 insertions, 1 deletions
@@ -1,3 +1,9 @@ += Unreleased + +* Fixed a bug that caused too many tags to be popped from the tag + stack during tree building, when encountering a closing tag that had + no matching opening tag. [bug=1880420] + = 4.9.1 (20200517) * Added a keyword argument 'on_duplicate_attribute' to the diff --git a/bs4/__init__.py b/bs4/__init__.py index 980c0ce..a2c839c 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -22,6 +22,7 @@ __license__ = "MIT" __all__ = ['BeautifulSoup'] +from collections import Counter import os import re import sys @@ -444,6 +445,7 @@ class BeautifulSoup(Tag): self.current_data = [] self.currentTag = None self.tagStack = [] + self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] self.string_container_stack = [] self.pushTag(self) @@ -509,6 +511,8 @@ class BeautifulSoup(Tag): def popTag(self): """Internal method called by _popToTag when a tag is closed.""" tag = self.tagStack.pop() + if tag.name in self.open_tag_counter: + self.open_tag_counter[tag.name] -= 1 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: self.preserve_whitespace_tag_stack.pop() if self.string_container_stack and tag == self.string_container_stack[-1]: @@ -525,6 +529,8 @@ class BeautifulSoup(Tag): self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] + if tag.name != self.ROOT_TAG_NAME: + self.open_tag_counter[tag.name] += 1 if tag.name in self.builder.preserve_whitespace_tags: self.preserve_whitespace_tag_stack.append(tag) if tag.name in self.builder.string_containers: @@ -635,13 +641,17 @@ class BeautifulSoup(Tag): def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent - instance of the given tag. + instance of the given tag. + + If there are no open tags with the given name, nothing will be + popped. :param name: Pop up to the most recent tag with this name. :param nsprefix: The namespace prefix that goes with `name`. :param inclusivePop: It this is false, pops the tag stack up to but *not* including the most recent instqance of the given tag. + """ #print("Popping to %s" % name) if name == self.ROOT_TAG_NAME: @@ -652,6 +662,8 @@ class BeautifulSoup(Tag): stack_size = len(self.tagStack) for i in range(stack_size - 1, 0, -1): + if not self.open_tag_counter.get(name): + break t = self.tagStack[i] if (name == t.name and nsprefix == t.prefix): if inclusivePop: diff --git a/bs4/testing.py b/bs4/testing.py index 660cccb..a2f83a1 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -86,8 +86,22 @@ class SoupTest(unittest.TestCase): if compare_parsed_to is None: compare_parsed_to = to_parse + # Verify that the documents come out the same. self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + # Also run some checks on the BeautifulSoup object itself: + + # Verify that every tag that was opened was eventually closed. + + # There are no tags in the open tag counter. + assert all(v==0 for v in obj.open_tag_counter.values()) + + # The only tag in the tag stack is the one for the root + # document. + self.assertEqual( + [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack] + ) + def assertConnectedness(self, element): """Ensure that next_element and previous_element are properly set for all descendants of the given element. @@ -850,6 +864,16 @@ Hello, world! data.a['foo'] = 'bar' self.assertEqual('<a foo="bar">text</a>', data.a.decode()) + def test_closing_tag_with_no_opening_tag(self): + # Without BeautifulSoup.open_tag_counter, the </span> tag will + # cause _popToTag to be called over and over again as we look + # for a <span> tag that wasn't there. The result is that 'text2' + # will show up outside the body of the document. + soup = self.soup("<body><div><p>text1</p></span>text2</div></body>") + self.assertEqual( + "<body><div><p>text1</p>text2</div></body>", soup.body.decode() + ) + def test_worst_case(self): """Test the worst case (currently) for linking issues.""" |