Limit how much of the document is searched via regular expression for a declared encoding.

author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:12:33 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:12:33 -0400
commit: d284ed9a3b16d7259303171934ade247186eb24f (patch)
tree: a2a2c54e54373695c571c605e2be4acc4baaeb9e
parent: c8545a7b00a77d2f606620ef69b36b35291d8174 (diff)
1 file changed, 11 insertions, 4 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 9ea432f..a6b8663 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -286,17 +286,24 @@ class EncodingDetector:
         return data, encoding
 
     @classmethod
-    def find_declared_encoding(cls, markup, is_html=False):
+    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
         """Given a document, tries to find its declared encoding.
 
         An XML encoding is declared at the beginning of the document.
 
-        An HTML encoding is declared in a <meta> tag.
+        An HTML encoding is declared in a <meta> tag, hopefully near the
+        beginning of the document.
         """
+        if search_entire_document:
+            xml_endpos = html_endpos = -1
+        else:
+            xml_endpos = 1025
+            html_endpos = max(2048, int(len(markup) * 0.05))
+            
         declared_encoding = None
-        declared_encoding_match = xml_encoding_re.match(markup)
+        declared_encoding_match = xml_encoding_re.search(markup, xml_endpos)
         if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup)
+            declared_encoding_match = html_meta_re.search(markup, html_endpos)
         if declared_encoding_match is not None:
             declared_encoding = declared_encoding_match.groups()[0].decode(
                 'ascii')
author	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-03 09:12:33 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-03 09:12:33 -0400
commit	d284ed9a3b16d7259303171934ade247186eb24f (patch)
tree	a2a2c54e54373695c571c605e2be4acc4baaeb9e
parent	c8545a7b00a77d2f606620ef69b36b35291d8174 (diff)