diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:12:33 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:12:33 -0400 |
commit | d284ed9a3b16d7259303171934ade247186eb24f (patch) | |
tree | a2a2c54e54373695c571c605e2be4acc4baaeb9e /bs4/dammit.py | |
parent | c8545a7b00a77d2f606620ef69b36b35291d8174 (diff) |
Limit how much of the document is searched via regular expression for a declared encoding.
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 15 |
1 file changed, 11 insertions, 4 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index 9ea432f..a6b8663 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -286,17 +286,24 @@ class EncodingDetector: return data, encoding @classmethod - def find_declared_encoding(cls, markup, is_html=False): + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): """Given a document, tries to find its declared encoding. An XML encoding is declared at the beginning of the document. - An HTML encoding is declared in a <meta> tag. + An HTML encoding is declared in a <meta> tag, hopefully near the + beginning of the document. """ + if search_entire_document: + xml_endpos = html_endpos = -1 + else: + xml_endpos = 1025 + html_endpos = max(2048, int(len(markup) * 0.05)) + declared_encoding = None - declared_encoding_match = xml_encoding_re.match(markup) + declared_encoding_match = xml_encoding_re.search(markup, xml_endpos) if not declared_encoding_match and is_html: - declared_encoding_match = html_meta_re.search(markup) + declared_encoding_match = html_meta_re.search(markup, html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( 'ascii') |