diff options
-rw-r--r-- | NEWS.txt | 11 | ||||
-rw-r--r-- | bs4/dammit.py | 5 | ||||
-rw-r--r-- | bs4/doc/source/index.rst | 29 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 15 |
4 files changed, 45 insertions, 15 deletions
@@ -1,7 +1,14 @@
+= 4.0.0b7 () =
+
+* Issue a warning if characters were replaced with REPLACEMENT
+  CHARACTER during Unicode conversion.
+
 = 4.0.0b6 (20110216) =
 
-* The value of multi-valued attributes like "class" are always turned
-  into a list, even if there's only one value.
+* Multi-valued attributes like "class" always have a list of values,
+  even if there's only one value in the list.
+
+* Added a number of multi-valued attributes defined in HTML5.
 
 * Stopped generating a space before the slash that closes an
   empty-element tag. This may come back if I add a special XHTML mode
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 76ac9ce..a35c213 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -9,6 +9,7 @@ encoding; that's the tree builder's job.
 import codecs
 from htmlentitydefs import codepoint2name
 import re
+import warnings
 
 # Autodetects character encodings. Very useful.
 # Download from http://chardet.feedparser.org/
@@ -212,6 +213,10 @@ class UnicodeDammit:
                 if proposed_encoding != "ascii":
                     u = self._convert_from(proposed_encoding, "replace")
                 if u is not None:
+                    warnings.warn(
+                        UnicodeWarning(
+                            "Some characters could not be decoded, and were "
+                            "replaced with REPLACEMENT CHARACTER."))
                     self.contains_replacement_characters = True
                     break
 
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index 8328ed7..200317a 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -303,19 +303,24 @@ done by treating the tag as a dictionary::
 Multi-valued attributes
 &&&&&&&&&&&&&&&&&&&&&&&
 
-HTML defines a few attributes that can have multiple values. The most
-common is ``class`` (a tag can have more than one CSS class), but
-there are a few others: ``rel``, ``rev``, ``archive``,
-``accept-charset``, and ``headers``. If one of these attributes has
-more than one value, Beautiful Soup will turn its values into a list::
+HTML 4 defines a few attributes that can have multiple values. HTML 5
+removes a couple of them, but defines a few more. The most common
+multi-valued attribute is ``class`` (that is, a tag can have more than
+one CSS class). Others include ``rel``, ``rev``, ``accept-charset``,
+``headers``, and ``accesskey``. Beautiful Soup presents the value(s)
+of a multi-valued attribute as a list::
 
  css_soup = BeautifulSoup('<p class="body strikeout"></p>')
  css_soup.p['class']
  # ["body", "strikeout"]
 
+ css_soup = BeautifulSoup('<p class="body"></p>')
+ css_soup.p['class']
+ # ["body"]
+
 If an attribute `looks` like it has more than one value, but it's not
-one of the special attributes listed above, Beautiful Soup will leave
-the attribute alone::
+a multi-valued attribute as defined by any version of the HTML
+standard, Beautiful Soup will leave the attribute alone::
 
  id_soup = BeautifulSoup('<p id="my id"></p>')
  id_soup.p['id']
@@ -326,11 +331,19 @@ consolidated::
 
  rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
  rel_soup.a['rel']
- # 'index'
+ # ['index']
  rel_soup.a['rel'] = ['index', 'contents']
  print(rel_soup.p)
  # <p>Back to the <a rel="index contents">homepage</a></p>
 
+If you parse a document as XML, there are no multi-valued attributes::
+
+ xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
+ xml_soup.p['class']
+ # u'body strikeout'
+
+
+
 ``NavigableString``
 -------------------
 
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index d744694..997a01f 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -177,9 +177,14 @@ class TestUnicodeDammit(unittest.TestCase):
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        dammit = UnicodeDammit(doc)
-        self.assertEqual(True, dammit.contains_replacement_characters)
-        self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+        with warnings.catch_warnings(record=True) as w:
+            dammit = UnicodeDammit(doc)
+            self.assertEqual(True, dammit.contains_replacement_characters)
+            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+            soup = BeautifulSoup(doc)
+            self.assertTrue(soup.contains_replacement_characters)
 
-        soup = BeautifulSoup(doc)
-        self.assertTrue(soup.contains_replacement_characters)
+            msg = w[0].message
+            self.assertTrue(isinstance(msg, UnicodeWarning))
+            self.assertTrue("Some characters could not be decoded" in str(msg))