summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-04-24 22:13:30 -0400
commit 197470f217ab994dd0ba8e143418a54d69df8523 (patch)
tree f962eb0fa745c772f63617abfb9caaee2f7c233d /bs4/__init__.py
parent e0d4d8de8f9608e3ac2d637544c786958b3d34b4 (diff)
If you encode a document with a Python-specific encoding like
'unicode_escape', that encoding is no longer mentioned in the final XML or HTML document. Instead, encoding information is omitted or left blank. [bug=1874955]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 10
1 files changed, 8 insertions, 2 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 834f180..04dcffc 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -39,6 +39,7 @@ from .element import (
NavigableString,
PageElement,
ProcessingInstruction,
+ PYTHON_SPECIFIC_ENCODINGS,
ResultSet,
Script,
Stylesheet,
@@ -109,7 +110,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
@@ -697,7 +698,7 @@ class BeautifulSoup(Tag):
def handle_data(self, data):
"""Called by the tree builder when a chunk of textual data is encountered."""
self.current_data.append(data)
-
+
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -712,6 +713,11 @@ class BeautifulSoup(Tag):
if self.is_xml:
# Print the XML declaration
encoding_part = ''
+ if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+ # This is a special Python encoding; it can't actually
+ # go into an XML document because it means nothing
+ # outside of Python.
+ eventual_encoding = None
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part