summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:12:33 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:12:33 -0400
commit: d284ed9a3b16d7259303171934ade247186eb24f (patch)
tree: a2a2c54e54373695c571c605e2be4acc4baaeb9e
parent: c8545a7b00a77d2f606620ef69b36b35291d8174 (diff)
Limit how much of the document is searched via regular expression for a declared encoding.
-rw-r--r-- bs4/dammit.py 15
1 files changed, 11 insertions, 4 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 9ea432f..a6b8663 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -286,17 +286,24 @@ class EncodingDetector:
return data, encoding
@classmethod
- def find_declared_encoding(cls, markup, is_html=False):
+ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
"""Given a document, tries to find its declared encoding.
An XML encoding is declared at the beginning of the document.
- An HTML encoding is declared in a <meta> tag.
+ An HTML encoding is declared in a <meta> tag, hopefully near the
+ beginning of the document.
"""
+ if search_entire_document:
+ xml_endpos = html_endpos = -1
+ else:
+ xml_endpos = 1025
+ html_endpos = max(2048, int(len(markup) * 0.05))
+
declared_encoding = None
- declared_encoding_match = xml_encoding_re.match(markup)
+ declared_encoding_match = xml_encoding_re.search(markup, xml_endpos)
if not declared_encoding_match and is_html:
- declared_encoding_match = html_meta_re.search(markup)
+ declared_encoding_match = html_meta_re.search(markup, html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii')