summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/dammit.py 6
-rw-r--r-- bs4/tests/test_soup.py 11
3 files changed, 18 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 4535f19..d03c442 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -17,6 +17,9 @@
* Fixed a bug that caused a crash when you passed a dictionary as an
attribute value (possibly because you mistyped "attrs"). [bug=842419]
+* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
+ like <meta charset="utf-8" />. [bug=837268]
+
* Fixed a bug that wrecked the tree if you replaced an element with an
empty string. [bug=728697]
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
except ImportError:
pass
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+ '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f995678..ddfc68c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -151,3 +151,14 @@ class TestUnicodeDammit(unittest.TestCase):
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding, 'utf-8')
+
+ def test_detect_html5_style_meta_tag(self):
+
+ for data in (
+ b'<html><meta charset="euc-jp" /></html>',
+ b"<html><meta charset='euc-jp' /></html>",
+ b"<html><meta charset=euc-jp /></html>",
+ b"<html><meta charset=euc-jp/></html>"):
+ dammit = UnicodeDammit(data, is_html=True)
+ self.assertEquals(
+ "euc-jp", dammit.original_encoding)