diff options
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/__init__.py | 3 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 19 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 14 |
3 files changed, 31 insertions, 5 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 03da4c6..b6e2c37 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -234,7 +234,8 @@ class TreeBuilder(object): :param markup: Some markup -- probably a bytestring. :param user_specified_encoding: The user asked to try this encoding. :param document_declared_encoding: The markup itself claims to be - in this encoding. + in this encoding. NOTE: This argument is not used by the + calling code and can probably be removed. :param exclude_encodings: The user asked _not_ to try any of these encodings. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 96a7b7d..2f2bf1e 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -359,9 +359,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): return # Ask UnicodeDammit to sniff the most likely encoding. + + # This was provided by the end-user; treat it as a known + # definite encoding per the algorithm laid out in the HTML5 + # spec. (See the EncodingDetector class for details.) + known_definite_encodings = [user_specified_encoding] + + # This was found in the document; treat it as a slightly lower-priority + # user encoding. + user_encodings = [document_declared_encoding] + try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True, - exclude_encodings=exclude_encodings) + dammit = UnicodeDammit( + markup, + known_definite_encodings=known_definite_encodings, + user_encodings=user_encodings, + is_html=True, + exclude_encodings=exclude_encodings + ) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 1b44d75..c670b84 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - try_encodings = [user_specified_encoding, document_declared_encoding] + # This was provided by the end-user; treat it as a known + # definite encoding per the algorithm laid out in the HTML5 + # spec. (See the EncodingDetector class for details.) + known_definite_encodings = [user_specified_encoding] + + # This was found in the document; treat it as a slightly lower-priority + # user encoding. + user_encodings = [document_declared_encoding] detector = EncodingDetector( - markup, try_encodings, is_html, exclude_encodings) + markup, known_definite_encodings=known_definite_encodings, + user_encodings=user_encodings, is_html=is_html, + exclude_encodings=exclude_encodings + ) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) |