diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-04-07 10:31:56 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-04-07 10:31:56 -0400 |
commit | 67336dd4cd781a7d27716a5bcaae939e80a7bc24 (patch) | |
tree | 0a345409186c2e6cb0ab312e18feb4bbb16c206b | |
parent | fb8179d217dfb11e81c28076fc3bf14bdf9a0038 (diff) |
Fixed an unhandled exception in BeautifulSoup.decode_contents
and methods that call it. [bug=2015545]
-rw-r--r-- | CHANGELOG | 7 |
-rw-r--r-- | bs4/__init__.py | 6 |
-rw-r--r-- | bs4/tests/test_pageelement.py | 3 |
-rw-r--r-- | bs4/tests/test_soup.py | 42 |
4 files changed, 53 insertions, 5 deletions
@@ -1,7 +1,12 @@ += 4.12.2 (20230407) + +* Fixed an unhandled exception in BeautifulSoup.decode_contents + and methods that call it. [bug=2015545] + = 4.12.1 (20230405) NOTE: the following things are likely to be dropped in the next -release of Beautiful Soup: +feature release of Beautiful Soup: Official support for Python 3.6. Inclusion of unit tests and test data in the wheel file. diff --git a/bs4/__init__.py b/bs4/__init__.py index 18d380b..3d2ab09 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.12.1" +__version__ = "4.12.2" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -776,7 +776,7 @@ class BeautifulSoup(Tag): def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): + formatter="minimal", iterator=None): """Returns a string or Unicode representation of the parse tree as an HTML or XML document. @@ -803,7 +803,7 @@ class BeautifulSoup(Tag): else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) + indent_level, eventual_encoding, formatter, iterator) # Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' _s = BeautifulSoup diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index e12df79..24f9385 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -49,7 +49,7 @@ class TestEncoding(SoupTest): assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents( encoding="utf8" ) - + def test_encode_deeply_nested_document(self): # This test verifies that encoding a string doesn't involve # any recursive function calls. 
If it did, this test would @@ -63,6 +63,7 @@ class TestEncoding(SoupTest): def test_deprecated_renderContents(self): html = "<b>\N{SNOWMAN}</b>" soup = self.soup(html) + soup.renderContents() assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents() def test_repr(self): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 64b8cf1..28013b8 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -24,6 +24,7 @@ from bs4.builder import ( from bs4.element import ( Comment, SoupStrainer, + PYTHON_SPECIFIC_ENCODINGS, Tag, NavigableString, ) @@ -210,6 +211,47 @@ class TestConstructor(SoupTest): assert [] == soup.string_container_stack +class TestOutput(SoupTest): + + @pytest.mark.parametrize( + "eventual_encoding,actual_encoding", [ + ("utf-8", "utf-8"), + ("utf-16", "utf-16"), + ] + ) + def test_decode_xml_declaration(self, eventual_encoding, actual_encoding): + # Most of the time, calling decode() on an XML document will + # give you a document declaration that mentions the encoding + # you intend to use when encoding the document as a + # bytestring. + soup = self.soup("<tag></tag>") + soup.is_xml = True + assert (f'<?xml version="1.0" encoding="{actual_encoding}"?>\n<tag></tag>' + == soup.decode(eventual_encoding=eventual_encoding)) + + @pytest.mark.parametrize( + "eventual_encoding", [x for x in PYTHON_SPECIFIC_ENCODINGS] + [None] + ) + def test_decode_xml_declaration_with_missing_or_python_internal_eventual_encoding(self, eventual_encoding): + # But if you pass a Python internal encoding into decode(), or + # omit the eventual_encoding altogether, the document + # declaration won't mention any particular encoding. + soup = BeautifulSoup("<tag></tag>", "html.parser") + soup.is_xml = True + assert (f'<?xml version="1.0"?>\n<tag></tag>' + == soup.decode(eventual_encoding=eventual_encoding)) + + def test(self): + # BeautifulSoup subclasses Tag and extends the decode() method. 
+ # Make sure the other Tag methods which call decode() call + # it correctly. + soup = self.soup("<tag></tag>") + assert b"<tag></tag>" == soup.encode(encoding="utf-8") + assert b"<tag></tag>" == soup.encode_contents(encoding="utf-8") + assert "<tag></tag>" == soup.decode_contents() + assert "<tag>\n</tag>\n" == soup.prettify() + + class TestWarnings(SoupTest): # Note that some of the tests in this class create BeautifulSoup # objects directly rather than using self.soup(). That's |