diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-09 10:38:14 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-02-09 10:38:14 -0500 |
commit | 5261e6c36d0f0c6cea364390dbe9f4cce66306e8 (patch) | |
tree | 37c0265b0b1f582ac344eaca569bde7c3bee80a9 /bs4/dammit.py | |
parent | 274ceca63fe55336201cd611d897662e5d000e8f (diff) |
Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268]
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 6 |
1 file changed, 4 insertions, 2 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 09ac89e..0c4bf17 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -27,8 +27,10 @@ try:
 except ImportError:
     pass
 
-xml_encoding_re = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode(), re.I)
+xml_encoding_re = re.compile(
+    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
+html_meta_re = re.compile(
+    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 
 class EntitySubstitution(object):
 