diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-02-15 20:37:18 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-02-15 20:37:18 -0500 |
commit | e0bbee776ca241d908af36e4e5ce0d0b1bedceaf (patch) | |
tree | d05d4fb74ebfeb14d8d5fd0a98deec229c1b5789 /bs4/tests/test_htmlparser.py | |
parent | 8432abbfa16efe13cd0c057f91bb42f1f6cb3e36 (diff) |
When the html.parser parser decides it can't parse a document, Beautiful
Soup now consistently propagates this fact by raising a
ParserRejectedMarkup error. [bug=2007343]
Diffstat (limited to 'bs4/tests/test_htmlparser.py')
-rw-r--r-- | bs4/tests/test_htmlparser.py | 24 |
1 files changed, 24 insertions, 0 deletions
```diff
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 470d393..a1195d8 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -3,9 +3,11 @@ trees."""
 
 from pdb import set_trace
 import pickle
+import pytest
 import warnings
 from bs4.builder import (
     HTMLParserTreeBuilder,
+    ParserRejectedMarkup,
     XMLParsedAsHTMLWarning,
 )
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -15,6 +17,28 @@ class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
 
     default_builder = HTMLParserTreeBuilder
 
+    def test_rejected_input(self):
+        # Python's html.parser will occasionally reject markup,
+        # especially when there is a problem with the initial DOCTYPE
+        # declaration. Different versions of Python sound the alarm in
+        # different ways, but Beautiful Soup consistently raises
+        # errors as ParserRejectedMarkup exceptions.
+        bad_markup = [
+            # https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
+            # https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
+            # https://github.com/python/cpython/issues/81928
+            b'\n<![\xff\xfe\xfe\xcd\x00',
+
+            #https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/de32aa55785be29bbc72a1a8e06b00611fb3d9f8
+            # https://github.com/python/cpython/issues/78661
+            #
+            b'<![n\x00',
+            b"<![UNKNOWN[]]>",
+        ]
+        for markup in bad_markup:
+            with pytest.raises(ParserRejectedMarkup):
+                soup = self.soup(markup)
+
     def test_namespaced_system_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
```