author     Leonard Richardson <leonardr@segfault.org>   2019-09-02 13:01:06 -0400
committer  Leonard Richardson <leonardr@segfault.org>   2019-09-02 13:01:06 -0400
commit     ab0626db2a60f4f22b97ece310d92038b3da5cc1 (patch)
tree       bce9ba60aefff198e3ae4c6337f108dcc8ec0aaa
parent     cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (diff)
Avoid a crash when trying to detect the declared encoding of a
Unicode document. Raise an explanatory exception when the underlying parser
completely rejects the incoming markup. [bug=1838877]
-rw-r--r--   CHANGELOG                |  6
-rw-r--r--   bs4/__init__.py          | 12
-rw-r--r--   bs4/builder/__init__.py  | 15
-rw-r--r--   bs4/builder/_lxml.py     |  4
-rw-r--r--   bs4/dammit.py            | 37
-rw-r--r--   bs4/tests/test_soup.py   | 88

6 files changed, 144 insertions, 18 deletions
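For context on the first fix: when every attempt to prepare and feed the markup fails, BeautifulSoup now raises ParserRejectedMarkup carrying the original parser error(s), instead of failing silently. Below is a rough, hypothetical sketch of the user-visible effect; the helper name parse_or_explain and the choice of the lxml-xml parser are illustrative assumptions, not part of this commit.

    from bs4 import BeautifulSoup
    from bs4.builder import ParserRejectedMarkup

    def parse_or_explain(markup, features="lxml-xml"):
        # After this commit, markup that every prepare_markup() attempt
        # fails to feed raises ParserRejectedMarkup instead of quietly
        # producing an unusable soup.
        try:
            return BeautifulSoup(markup, features)
        except ParserRejectedMarkup as e:
            # The exception text explains the failure and appends the
            # original exception(s) raised by the underlying parser.
            print(e)
            return None

Ordinary markup parses as before; only input that the chosen parser rejects outright takes the except branch. Catching bs4.builder.ParserRejectedMarkup is the condition exercised by the new test_parser_markup_rejection test in the diff below.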
diff --git a/CHANGELOG b/CHANGELOG
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -20,6 +20,12 @@
   or string ('string_class') encountered during parsing, rather than
   using the default Tag and NavigableString objects.
 
+* Raise an explanatory exception when the underlying parser
+  completely rejects the incoming markup. [bug=1838877]
+
+* Avoid a crash when trying to detect the declared encoding of a
+  Unicode document. [bug=1838877]
+
 = 4.8.0 (20190720, "One Small Soup")
 
 This release focuses on making it easier to customize Beautiful Soup's
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e27ca6f..e85a0bf 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -302,6 +302,8 @@ class BeautifulSoup(Tag):
                     ' Beautiful Soup.' % markup)
         self._check_markup_is_url(markup)
 
+        rejections = []
+        success = False
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
              self.builder.prepare_markup(
@@ -309,10 +311,18 @@ class BeautifulSoup(Tag):
             self.reset()
             try:
                 self._feed()
+                success = True
                 break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                 pass
 
+        if not success:
+            other_exceptions = [unicode(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
         # Clear out the markup and remove the builder's circular
         # reference to this object.
         self.markup = None
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e28242b..7efbf89 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -175,8 +175,8 @@ class TreeBuilder(object):
         raise NotImplementedError()
 
     def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        return markup, None, None, False
+                       document_declared_encoding=None, exclude_encodings=None):
+        yield markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
         """Wrap an HTML fragment to make it look like a document.
@@ -363,8 +363,15 @@ def register_treebuilders_from(module):
             this_module.builder_registry.register(obj)
 
 class ParserRejectedMarkup(Exception):
-    pass
-
+    def __init__(self, message_or_exception):
+        """Explain why the parser rejected the given markup, either
+        with a textual explanation or another exception.
+        """
+        if isinstance(message_or_exception, Exception):
+            e = message_or_exception
+            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+        super(ParserRejectedMarkup, self).__init__(message_or_exception)
+
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 85be1b5..ea66d8b 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -175,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.parser.feed(data)
             self.parser.close()
         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
 
     def close(self):
         self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@@ -294,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser.feed(markup)
             self.parser.close()
         except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
 
     def test_fragment_to_document(self, fragment):
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 08109f2..74fa7f0 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -22,6 +22,8 @@ try:
     # PyPI package: cchardet
     import cchardet
     def chardet_dammit(s):
+        if isinstance(s, unicode):
+            return None
         return cchardet.detect(s)['encoding']
 except ImportError:
     try:
@@ -30,6 +32,8 @@ except ImportError:
         # PyPI package: chardet
         import chardet
         def chardet_dammit(s):
+            if isinstance(s, unicode):
+                return None
             return chardet.detect(s)['encoding']
         #import chardet.constants
         #chardet.constants._debug = 1
@@ -44,10 +48,19 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile(
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+    'html' : re.compile(html_meta.encode("ascii"), re.I),
+    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[unicode] = {
+    'html' : re.compile(html_meta, re.I),
+    'xml' : re.compile(xml_encoding, re.I)
+}
 
 
 class EntitySubstitution(object):
@@ -319,14 +332,22 @@ class EncodingDetector:
             xml_endpos = 1024
             html_endpos = max(2048, int(len(markup) * 0.05))
 
+        if isinstance(markup, bytes):
+            res = encoding_res[bytes]
+        else:
+            res = encoding_res[unicode]
+
+        xml_re = res['xml']
+        html_re = res['html']
         declared_encoding = None
-        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
         if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
         if declared_encoding_match is not None:
-            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii', 'replace')
+            declared_encoding = declared_encoding_match.groups()[0]
         if declared_encoding:
+            if isinstance(declared_encoding, bytes):
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
             return declared_encoding.lower()
         return None
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index af5f791..3603e81 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -11,6 +11,10 @@ from bs4 import (
     BeautifulSoup,
     BeautifulStoneSoup,
 )
+from bs4.builder import (
+    TreeBuilder,
+    ParserRejectedMarkup,
+)
 from bs4.element import (
     CharsetMetaAttributeValue,
     Comment,
@@ -20,6 +24,7 @@ from bs4.element import (
     Tag,
     NavigableString,
 )
+
 import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
@@ -36,7 +41,7 @@ import warnings
 try:
     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
     LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
 
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -65,10 +70,20 @@ class TestConstructor(SoupTest):
             def __init__(self, **kwargs):
                 self.called_with = kwargs
                 self.is_xml = True
+                self.store_line_numbers = False
+                self.cdata_list_attributes = []
+                self.preserve_whitespace_tags = []
             def initialize_soup(self, soup):
                 pass
+            def feed(self, markup):
+                self.fed = markup
+            def reset(self):
+                pass
+            def ignore(self, ignore):
+                pass
+            set_up_substitutions = can_be_empty_element = ignore
             def prepare_markup(self, *args, **kwargs):
-                return ''
+                yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
 
         kwargs = dict(
             var="value",
@@ -80,7 +95,8 @@ class TestConstructor(SoupTest):
         soup = BeautifulSoup('', builder=Mock, **kwargs)
         assert isinstance(soup.builder, Mock)
         self.assertEqual(dict(var="value"), soup.builder.called_with)
-
+        self.assertEqual("prepared markup", soup.builder.fed)
+
         # You can also instantiate the TreeBuilder yourself. In this
         # case, that specific object is used and any keyword arguments
         # to the BeautifulSoup constructor are ignored.
@@ -94,6 +110,26 @@ class TestConstructor(SoupTest):
         self.assertEqual(builder, soup.builder)
         self.assertEqual(kwargs, builder.called_with)
 
+    def test_parser_markup_rejection(self):
+        # If markup is completely rejected by the parser, an
+        # explanatory ParserRejectedMarkup exception is raised.
+        class Mock(TreeBuilder):
+            def feed(self, *args, **kwargs):
+                raise ParserRejectedMarkup("Nope.")
+
+            def prepare_markup(self, *args, **kwargs):
+                # We're going to try two different ways of preparing this markup,
+                # but feed() will reject both of them.
+                yield markup, None, None, False
+                yield markup, None, None, False
+
+        import re
+        self.assertRaisesRegexp(
+            ParserRejectedMarkup,
+            "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.",
+            BeautifulSoup, '', builder=Mock,
+        )
+
     def test_cdata_list_attributes(self):
         # Most attribute values are represented as scalars, but the
         # HTML standard says that some attributes, like 'class' have
@@ -554,6 +590,52 @@ class TestUnicodeDammit(unittest.TestCase):
         output = UnicodeDammit.detwingle(input)
         self.assertEqual(output, input)
 
+    def test_find_declared_encoding(self):
+        # Test our ability to find a declared encoding inside an
+        # XML or HTML document.
+        #
+        # Even if the document comes in as Unicode, it may be
+        # interesting to know what encoding was claimed
+        # originally.
+
+        html_unicode = u'<html><head><meta charset="utf-8"></head></html>'
+        html_bytes = html_unicode.encode("ascii")
+
+        xml_unicode= u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
+        xml_bytes = xml_unicode.encode("ascii")
+
+        m = EncodingDetector.find_declared_encoding
+        self.assertEquals(None, m(html_unicode, is_html=False))
+        self.assertEquals("utf-8", m(html_unicode, is_html=True))
+        self.assertEquals("utf-8", m(html_bytes, is_html=True))
+
+        self.assertEquals("iso-8859-1", m(xml_unicode))
+        self.assertEquals("iso-8859-1", m(xml_bytes))
+
+        # Normally, only the first few kilobytes of a document are checked for
+        # an encoding.
+        spacer = b' ' * 5000
+        self.assertEquals(None, m(spacer + html_bytes))
+        self.assertEquals(None, m(spacer + xml_bytes))
+
+        # But you can tell find_declared_encoding to search an entire
+        # HTML document.
+        self.assertEquals(
+            "utf-8",
+            m(spacer + html_bytes, is_html=True, search_entire_document=True)
+        )
+
+        # The XML encoding declaration has to be the very first thing
+        # in the document. We'll allow whitespace before the document
+        # starts, but nothing else.
+        self.assertEquals(
+            "iso-8859-1",
+            m(xml_bytes, search_entire_document=True)
+        )
+        self.assertEquals(
+            None, m(b'a' + xml_bytes, search_entire_document=True)
+        )
+
 class TestNamedspacedAttribute(SoupTest):
 
     def test_name_may_be_none(self):
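As a usage note on the second fix, here is a minimal sketch that mirrors the new test_find_declared_encoding test above: EncodingDetector.find_declared_encoding no longer crashes when handed an already-decoded Unicode document, and still reports whatever encoding the document claims. The sample documents are taken from that test; the printed values are assumptions based on its assertions.

    from bs4.dammit import EncodingDetector

    # A Unicode document used to trip the bytes-only regular expressions;
    # with this commit the declared encoding (or None) is returned instead.
    xml = u'<?xml version="1.0" encoding="ISO-8859-1" ?>'
    html = u'<html><head><meta charset="utf-8"></head></html>'

    print(EncodingDetector.find_declared_encoding(xml))                 # iso-8859-1
    print(EncodingDetector.find_declared_encoding(html, is_html=True))  # utf-8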