diff options
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/dammit.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 11 |
3 files changed, 18 insertions, 2 deletions
@@ -17,6 +17,9 @@
 * Fixed a bug that caused a crash when you passed a dictionary as an
   attribute value (possibly because you mistyped "attrs"). [bug=842419]
 
+* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
+  like <meta charset="utf-8" />. [bug=837268]
+
 * Fixed a bug that wrecked the tree if you replaced an element with an
   empty string. [bug=728697]
 
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 
 class EntitySubstitution(object):
 
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f995678..ddfc68c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -151,3 +151,14 @@ class TestUnicodeDammit(unittest.TestCase):
         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
             dammit = UnicodeDammit(utf8_data, [bad_encoding])
             self.assertEqual(dammit.original_encoding, 'utf-8')
+
+    def test_detect_html5_style_meta_tag(self):
+
+        for data in (
+            b'<html><meta charset="euc-jp" /></html>',
+            b"<html><meta charset='euc-jp' /></html>",
+            b"<html><meta charset=euc-jp /></html>",
+            b"<html><meta charset=euc-jp/></html>"):
+            dammit = UnicodeDammit(data, is_html=True)
+            self.assertEquals(
+                "euc-jp", dammit.original_encoding)