summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG6
-rw-r--r--bs4/__init__.py14
-rw-r--r--bs4/testing.py24
3 files changed, 43 insertions, 1 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d50ff6f..c6babac 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+= Unreleased
+
+* Fixed a bug that caused too many tags to be popped from the tag
+ stack during tree building, when encountering a closing tag that had
+ no matching opening tag. [bug=1880420]
+
= 4.9.1 (20200517)
* Added a keyword argument 'on_duplicate_attribute' to the
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 980c0ce..a2c839c 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -22,6 +22,7 @@ __license__ = "MIT"
__all__ = ['BeautifulSoup']
+from collections import Counter
import os
import re
import sys
@@ -444,6 +445,7 @@ class BeautifulSoup(Tag):
self.current_data = []
self.currentTag = None
self.tagStack = []
+ self.open_tag_counter = Counter()
self.preserve_whitespace_tag_stack = []
self.string_container_stack = []
self.pushTag(self)
@@ -509,6 +511,8 @@ class BeautifulSoup(Tag):
def popTag(self):
"""Internal method called by _popToTag when a tag is closed."""
tag = self.tagStack.pop()
+ if tag.name in self.open_tag_counter:
+ self.open_tag_counter[tag.name] -= 1
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
if self.string_container_stack and tag == self.string_container_stack[-1]:
@@ -525,6 +529,8 @@ class BeautifulSoup(Tag):
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
+ if tag.name != self.ROOT_TAG_NAME:
+ self.open_tag_counter[tag.name] += 1
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
if tag.name in self.builder.string_containers:
@@ -635,13 +641,17 @@ class BeautifulSoup(Tag):
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
- instance of the given tag.
+ instance of the given tag.
+
+ If there are no open tags with the given name, nothing will be
+ popped.
:param name: Pop up to the most recent tag with this name.
:param nsprefix: The namespace prefix that goes with `name`.
:param inclusivePop: It this is false, pops the tag stack up
to but *not* including the most recent instqance of the
given tag.
+
"""
#print("Popping to %s" % name)
if name == self.ROOT_TAG_NAME:
@@ -652,6 +662,8 @@ class BeautifulSoup(Tag):
stack_size = len(self.tagStack)
for i in range(stack_size - 1, 0, -1):
+ if not self.open_tag_counter.get(name):
+ break
t = self.tagStack[i]
if (name == t.name and nsprefix == t.prefix):
if inclusivePop:
diff --git a/bs4/testing.py b/bs4/testing.py
index 660cccb..a2f83a1 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -86,8 +86,22 @@ class SoupTest(unittest.TestCase):
if compare_parsed_to is None:
compare_parsed_to = to_parse
+ # Verify that the documents come out the same.
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
+ # Also run some checks on the BeautifulSoup object itself:
+
+ # Verify that every tag that was opened was eventually closed.
+
+ # There are no tags in the open tag counter.
+ assert all(v==0 for v in obj.open_tag_counter.values())
+
+ # The only tag in the tag stack is the one for the root
+ # document.
+ self.assertEqual(
+ [obj.ROOT_TAG_NAME], [x.name for x in obj.tagStack]
+ )
+
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
@@ -850,6 +864,16 @@ Hello, world!
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+ def test_closing_tag_with_no_opening_tag(self):
+ # Without BeautifulSoup.open_tag_counter, the </span> tag will
+ # cause _popToTag to be called over and over again as we look
+ # for a <span> tag that wasn't there. The result is that 'text2'
+ # will show up outside the body of the document.
+ soup = self.soup("<body><div><p>text1</p></span>text2</div></body>")
+ self.assertEqual(
+ "<body><div><p>text1</p>text2</div></body>", soup.body.decode()
+ )
+
def test_worst_case(self):
"""Test the worst case (currently) for linking issues."""