diff options
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/doc/source/index.rst | 21 | ||||
-rw-r--r-- | bs4/element.py | 9 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 10 |
4 files changed, 41 insertions, 3 deletions
@@ -1,5 +1,9 @@ = 4.0.0b7 () = +* Upon decoding to string, any characters that can't be represented in + your chosen encoding will be converted into numeric XML entity + references. + * Issue a warning if characters were replaced with REPLACEMENT CHARACTER during Unicode conversion. diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst index 200317a..0467c00 100644 --- a/bs4/doc/source/index.rst +++ b/bs4/doc/source/index.rst @@ -2160,6 +2160,27 @@ element in the soup, just as if it were a Python string:: soup.p.encode("utf-8") # '<p>Sacr\xc3\xa9 bleu!</p>' +Any characters that can't be represented in your chosen encoding will +be converted into numeric XML entity references. For instance, here's +a document that includes the Unicode character SNOWMAN:: + + markup = u"<b>\N{SNOWMAN}</b>" + snowman_soup = BeautifulSoup(markup) + tag = snowman_soup.b + +The SNOWMAN character can be part of a UTF-8 document (it looks like +☃), but there's no representation for that character in ISO-Latin-1 or +ASCII, so it's converted into "&#9731;" for those encodings:: + + print(tag.encode("utf-8")) + # <b>☃</b> + + print tag.encode("latin-1") + # <b>&#9731;</b> + + print tag.encode("ascii") + # <b>&#9731;</b> + Unicode, Dammit --------------- diff --git a/bs4/element.py b/bs4/element.py index a0f64ba..513407c 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -745,9 +745,12 @@ class Tag(PageElement): __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, formatter="minimal"): - return self.decode(indent_level, encoding, - formatter).encode(encoding) + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. 
+ u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors=errors) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 9e57d54..70a7da1 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1242,6 +1242,16 @@ class TestEncoding(SoupTest): self.assertEqual( soup.b.encode("utf-8"), html.encode("utf-8")) + def test_encoding_substitutes_unrecognized_characters_by_default(self): + html = u"<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>") + + def test_encoding_can_be_made_strict(self): + html = u"<b>\N{SNOWMAN}</b>" + soup = self.soup(html) + self.assertRaises( + UnicodeEncodeError, soup.encode, "ascii", errors="strict") class TestNavigableStringSubclasses(SoupTest): |