From 197470f217ab994dd0ba8e143418a54d69df8523 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 24 Apr 2020 22:13:30 -0400 Subject: If you encode a document with a Python-specific encoding like 'unicode_escape', that encoding is no longer mentioned in the final XML or HTML document. Instead, encoding information is omitted or left blank. [bug=1874955] --- bs4/testing.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'bs4/testing.py') diff --git a/bs4/testing.py b/bs4/testing.py index 328bd56..660cccb 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -15,6 +15,7 @@ from bs4.element import ( Comment, ContentMetaAttributeValue, Doctype, + PYTHON_SPECIFIC_ENCODINGS, SoupStrainer, Script, Stylesheet, @@ -821,6 +822,29 @@ Hello, world! # encoding. self.assertEqual('utf8', charset.encode("utf8")) + def test_python_specific_encodings_not_used_in_charset(self): + # You can encode an HTML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. Instead, the document will appear to + # have no encoding. + for markup in [ + b'' + b'' + ]: + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + u'idna', u'mbcs', u'oem', u'undefined', + u'string_escape', u'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'meta charset=""' in encoded + assert encoding.encode("ascii") not in encoded + def test_tag_with_no_attributes_can_have_attributes_added(self): data = self.soup("text") data.a['foo'] = 'bar' @@ -854,6 +878,25 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) + def test_python_specific_encodings_not_used_in_xml_declaration(self): + # You can encode an XML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. + markup = b"""\n""" + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + u'idna', u'mbcs', u'oem', u'undefined', + u'string_escape', u'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'' in encoded + assert encoding.encode("ascii") not in encoded + def test_processing_instruction(self): markup = b"""\n""" soup = self.soup(markup) -- cgit v1.2.3