summary | refs | log | tree | commit | diff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:38:14 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-02-09 10:38:14 -0500
commit: 5261e6c36d0f0c6cea364390dbe9f4cce66306e8 (patch)
tree: 37c0265b0b1f582ac344eaca569bde7c3bee80a9 /bs4/dammit.py
parent: 274ceca63fe55336201cd611d897662e5d000e8f (diff)
Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- bs4/dammit.py | 6
1 files changed, 4 insertions, 2 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
except ImportError:
pass
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+ '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):