summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt4
-rw-r--r--bs4/doc/source/index.rst21
-rw-r--r--bs4/element.py9
-rw-r--r--bs4/tests/test_tree.py10
4 files changed, 41 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 5dbd044..8f16cd5 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,9 @@
= 4.0.0b7 () =
+* Upon encoding to a bytestring, any characters that can't be represented in
+ your chosen encoding will be converted into numeric XML entity
+ references.
+
* Issue a warning if characters were replaced with REPLACEMENT
CHARACTER during Unicode conversion.
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index 200317a..0467c00 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -2160,6 +2160,27 @@ element in the soup, just as if it were a Python string::
soup.p.encode("utf-8")
# '<p>Sacr\xc3\xa9 bleu!</p>'
+Any characters that can't be represented in your chosen encoding will
+be converted into numeric XML entity references. For instance, here's
+a document that includes the Unicode character SNOWMAN::
+
+ markup = u"<b>\N{SNOWMAN}</b>"
+ snowman_soup = BeautifulSoup(markup)
+ tag = snowman_soup.b
+
+The SNOWMAN character can be part of a UTF-8 document (it looks like
+☃), but there's no representation for that character in ISO-Latin-1 or
+ASCII, so it's converted into "&#9731;" for those encodings::
+
+ print(tag.encode("utf-8"))
+ # <b>☃</b>
+
+ print(tag.encode("latin-1"))
+ # <b>&#9731;</b>
+
+ print(tag.encode("ascii"))
+ # <b>&#9731;</b>
+
Unicode, Dammit
---------------
diff --git a/bs4/element.py b/bs4/element.py
index a0f64ba..513407c 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -745,9 +745,12 @@ class Tag(PageElement):
__str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- indent_level=None, formatter="minimal"):
- return self.decode(indent_level, encoding,
- formatter).encode(encoding)
+ indent_level=None, formatter="minimal",
+ errors="xmlcharrefreplace"):
+ # Turn the data structure into Unicode, then encode the
+ # Unicode.
+ u = self.decode(indent_level, encoding, formatter)
+ return u.encode(encoding, errors=errors)
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 9e57d54..70a7da1 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1242,6 +1242,16 @@ class TestEncoding(SoupTest):
self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8"))
+ def test_encoding_substitutes_unrecognized_characters_by_default(self):
+ html = u"<b>\N{SNOWMAN}</b>"
+ soup = self.soup(html)
+ self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
+
+ def test_encoding_can_be_made_strict(self):
+ html = u"<b>\N{SNOWMAN}</b>"
+ soup = self.soup(html)
+ self.assertRaises(
+ UnicodeEncodeError, soup.encode, "ascii", errors="strict")
class TestNavigableStringSubclasses(SoupTest):