summaryrefslogtreecommitdiff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r--bs4/dammit.py6
-rw-r--r--bs4/tests/test_soup.py11
2 files changed, 15 insertions, 2 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
except ImportError:
pass
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+ '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f995678..ddfc68c 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -151,3 +151,14 @@ class TestUnicodeDammit(unittest.TestCase):
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding, 'utf-8')
+
+ def test_detect_html5_style_meta_tag(self):
+
+ for data in (
+ b'<html><meta charset="euc-jp" /></html>',
+ b"<html><meta charset='euc-jp' /></html>",
+ b"<html><meta charset=euc-jp /></html>",
+ b"<html><meta charset=euc-jp/></html>"):
+ dammit = UnicodeDammit(data, is_html=True)
+ self.assertEquals(
+ "euc-jp", dammit.original_encoding)