 CHANGELOG            |  5
 bs4/__init__.py      | 10
 bs4/element.py       | 33
 bs4/testing.py       | 43
 doc/source/index.rst | 44
 5 files changed, 110 insertions(+), 25 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index 1c65d5b..7e9eca8 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -12,6 +12,11 @@
* The new NavigableString subclasses (Stylesheet, Script, and
TemplateString) can now be imported directly from the bs4 package.
+* If you encode a document with a Python-specific encoding like
+  'unicode_escape', that encoding is no longer mentioned in the final
+  XML or HTML document. Instead, the encoding is omitted from the XML
+  declaration, or given as the empty string in the HTML <meta>
+  tag. [bug=1874955]
+
* Fixed test failures when run against soupsieve 2.0. Patch by Tomáš
Chvátal. [bug=1872279]
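
The first entry above is easy to demonstrate end to end. A minimal
sketch, assuming this patch is applied and lxml is installed (the
"xml" feature requires it)::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(b'<?xml version="1.0"?>\n<foo/>', 'xml')
    encoded = soup.encode('unicode_escape')

    # The XML declaration survives, but the Python-specific encoding
    # name appears nowhere in the document.
    assert b'<?xml version="1.0"?>' in encoded
    assert b'unicode_escape' not in encoded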
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 834f180..04dcffc 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -39,6 +39,7 @@ from .element import (
NavigableString,
PageElement,
ProcessingInstruction,
+ PYTHON_SPECIFIC_ENCODINGS,
ResultSet,
Script,
Stylesheet,
@@ -109,7 +110,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
@@ -697,7 +698,7 @@ class BeautifulSoup(Tag):
def handle_data(self, data):
"""Called by the tree builder when a chunk of textual data is encountered."""
self.current_data.append(data)
-
+
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@@ -712,6 +713,11 @@ class BeautifulSoup(Tag):
if self.is_xml:
# Print the XML declaration
encoding_part = ''
+ if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+ # This is a special Python encoding; it can't actually
+ # go into an XML document because it means nothing
+ # outside of Python.
+ eventual_encoding = None
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
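
Pulled out of context, the declaration logic added above reduces to
this standalone sketch (not bs4's public API, just the control flow)::

    PYTHON_SPECIFIC_ENCODINGS = {'unicode_escape', 'punycode'}  # abridged

    def xml_declaration(eventual_encoding='utf-8'):
        # A Python-specific encoding can't go into an XML document,
        # so the encoding attribute is dropped entirely.
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            eventual_encoding = None
        encoding_part = ''
        if eventual_encoding is not None:
            encoding_part = ' encoding="%s"' % eventual_encoding
        return '<?xml version="1.0"%s?>\n' % encoding_part

    xml_declaration('utf-8')           # '<?xml version="1.0" encoding="utf-8"?>\n'
    xml_declaration('unicode_escape')  # '<?xml version="1.0"?>\n'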
diff --git a/bs4/element.py b/bs4/element.py
index 8c553cd..1744beb 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -43,6 +43,35 @@ def _alias(attr):
return alias
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+ u"idna",
+ u"mbcs",
+ u"oem",
+ u"palmos",
+ u"punycode",
+ u"raw_unicode_escape",
+ u"undefined",
+ u"unicode_escape",
+ u"raw-unicode-escape",
+ u"unicode-escape",
+ u"string-escape",
+ u"string_escape",
+])
+
+
class NamespacedAttribute(unicode):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it.
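
Every name in the set above is accepted by Python's codec machinery,
in both hyphenated and underscored spellings, even though no consumer
of XML or HTML understands it, which is exactly why these encodings
need special-casing::

    # Both spellings resolve to the same Python-only codec.
    u'déjà vu'.encode('unicode_escape')   # b'd\xe9j\xe0 vu'
    u'déjà vu'.encode('unicode-escape')   # same bytes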
@@ -85,6 +114,8 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""When an HTML document is being encoded to a given encoding, the
value of a meta tag's 'charset' is the name of the encoding.
"""
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
return encoding
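
Roughly, the effect on a <meta charset> tag looks like this (a sketch
using the html.parser builder; self-closing and attribute-order
details can vary by builder)::

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<meta charset="utf-8"/>', 'html.parser')
    soup.encode('utf-8')           # b'<meta charset="utf-8"/>'
    soup.encode('unicode_escape')  # b'<meta charset=""/>'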
@@ -110,6 +141,8 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
def rewrite(match):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
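
And the analogous effect on an old-style Content-Type <meta> tag, a
sketch under the same assumptions (bs4 serializes attributes in
sorted order, so content precedes http-equiv)::

    from bs4 import BeautifulSoup

    markup = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    soup = BeautifulSoup(markup, 'html.parser')
    soup.encode('latin-1')
    # b'<meta content="text/html; charset=latin-1" http-equiv="Content-Type"/>'
    soup.encode('unicode_escape')
    # b'<meta content="" http-equiv="Content-Type"/>'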
diff --git a/bs4/testing.py b/bs4/testing.py
index 328bd56..660cccb 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -15,6 +15,7 @@ from bs4.element import (
Comment,
ContentMetaAttributeValue,
Doctype,
+ PYTHON_SPECIFIC_ENCODINGS,
SoupStrainer,
Script,
Stylesheet,
@@ -821,6 +822,29 @@ Hello, world!
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></head>',
+            b'<meta id="encoding" charset="utf-8" />'
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    u'idna', u'mbcs', u'oem', u'undefined',
+                    u'string_escape', u'string-escape'
+                ):
+                    # For one reason or another, these will raise an
+                    # exception if we actually try to use them, so don't
+                    # bother.
+                    continue
+                encoded = soup.encode(encoding)
+                assert b'meta charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
@@ -854,6 +878,25 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
+    def test_python_specific_encodings_not_used_in_xml_declaration(self):
+        # You can encode an XML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document.
+        markup = b"""<?xml version="1.0"?>\n<foo/>"""
+        soup = self.soup(markup)
+        for encoding in PYTHON_SPECIFIC_ENCODINGS:
+            if encoding in (
+                u'idna', u'mbcs', u'oem', u'undefined',
+                u'string_escape', u'string-escape'
+            ):
+                # For one reason or another, these will raise an
+                # exception if we actually try to use them, so don't
+                # bother.
+                continue
+            encoded = soup.encode(encoding)
+            assert b'<?xml version="1.0"?>' in encoded
+            assert encoding.encode("ascii") not in encoded
+
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
diff --git a/doc/source/index.rst b/doc/source/index.rst
index dbc8c15..148b30f 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -290,10 +290,9 @@ This table summarizes the advantages and disadvantages of each parser library:
+----------------------+--------------------------------------------+--------------------------------+--------------------------+
If you can, I recommend you install and use lxml for speed. If you're
-using a version of Python 2 earlier than 2.7.3, or a version of Python
-3 earlier than 3.2.2, it's `essential` that you install lxml or
-html5lib--Python's built-in HTML parser is just not very good in older
-versions.
+using a very old version of Python -- earlier than 2.7.3 or 3.2.2 --
+it's `essential` that you install lxml or html5lib. Python's built-in
+HTML parser is just not very good in those old versions.
Note that if a document is invalid, different parsers will generate
different Beautiful Soup trees for it. See `Differences
@@ -310,13 +309,13 @@ constructor. You can pass in a string or an open filehandle::
with open("index.html") as fp:
soup = BeautifulSoup(fp)
- soup = BeautifulSoup("<html>data</html>")
+ soup = BeautifulSoup("<html>a web page</html>")
First, the document is converted to Unicode, and HTML entities are
converted to Unicode characters::
- BeautifulSoup("Sacr&eacute; bleu!")
- <html><head></head><body>Sacré bleu!</body></html>
+ print(BeautifulSoup("<html><head></head><body>Sacr&eacute; bleu!</body></html>"))
+ # <html><head></head><body>Sacré bleu!</body></html>
Beautiful Soup then parses the document using the best available
parser. It will use an HTML parser unless you specifically tell it to
@@ -2481,20 +2480,20 @@ Beautiful Soup presents the same interface to a number of different
parsers, but each parser is different. Different parsers will create
different parse trees from the same document. The biggest differences
are between the HTML parsers and the XML parsers. Here's a short
-document, parsed as HTML::
+document, parsed as HTML using the parser that comes with Python::
- BeautifulSoup("<a><b /></a>")
- # <html><head></head><body><a><b></b></a></body></html>
+ BeautifulSoup("<a><b/></a>", "html.parser")
+ # <a><b></b></a>
-Since an empty <b /> tag is not valid HTML, the parser turns it into a
-<b></b> tag pair.
+Since a standalone <b/> tag is not valid HTML, html.parser turns it into
+a <b></b> tag pair.
Here's the same document parsed as XML (running this requires that you
-have lxml installed). Note that the empty <b /> tag is left alone, and
+have lxml installed). Note that the standalone <b/> tag is left alone, and
that the document is given an XML declaration instead of being put
into an <html> tag.::
- BeautifulSoup("<a><b /></a>", "xml")
+ print(BeautifulSoup("<a><b/></a>", "xml"))
# <?xml version="1.0" encoding="utf-8"?>
# <a><b/></a>
@@ -2506,8 +2505,8 @@ document.
But if the document is not perfectly-formed, different parsers will
give different results. Here's a short, invalid document parsed using
-lxml's HTML parser. Note that the dangling </p> tag is simply
-ignored::
+lxml's HTML parser. Note that the <a> tag gets wrapped in <body> and
+<html> tags, and the dangling </p> tag is simply ignored::
BeautifulSoup("<a></p>", "lxml")
# <html><body><a></a></body></html>
@@ -2518,8 +2517,8 @@ Here's the same document parsed using html5lib::
# <html><head></head><body><a><p></p></a></body></html>
Instead of ignoring the dangling </p> tag, html5lib pairs it with an
-opening <p> tag. This parser also adds an empty <head> tag to the
-document.
+opening <p> tag. html5lib also adds an empty <head> tag; lxml didn't
+bother.
Here's the same document parsed with Python's built-in HTML
parser::
@@ -2528,14 +2527,13 @@ parser::
# <a></a>
Like html5lib, this parser ignores the closing </p> tag. Unlike
-html5lib, this parser makes no attempt to create a well-formed HTML
-document by adding a <body> tag. Unlike lxml, it doesn't even bother
-to add an <html> tag.
+html5lib or lxml, this parser makes no attempt to create a
+well-formed HTML document by adding <html> or <body> tags.
Since the document "<a></p>" is invalid, none of these techniques is
-the "correct" way to handle it. The html5lib parser uses techniques
+the 'correct' way to handle it. The html5lib parser uses techniques
that are part of the HTML5 standard, so it has the best claim on being
-the "correct" way, but all three techniques are legitimate.
+the 'correct' way, but all three techniques are legitimate.
Differences between parsers can affect your script. If you're planning
on distributing your script to other people, or running it on multiple