summary | refs | log | tree | commit | diff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r-- bs4/builder/__init__.py | 3
-rw-r--r-- bs4/builder/_htmlparser.py | 19
-rw-r--r-- bs4/builder/_lxml.py | 14
3 files changed, 31 insertions, 5 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 03da4c6..b6e2c37 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -234,7 +234,8 @@ class TreeBuilder(object):
:param markup: Some markup -- probably a bytestring.
:param user_specified_encoding: The user asked to try this encoding.
:param document_declared_encoding: The markup itself claims to be
- in this encoding.
+ in this encoding. NOTE: This argument is not used by the
+ calling code and can probably be removed.
:param exclude_encodings: The user asked _not_ to try any of
these encodings.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 96a7b7d..2f2bf1e 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -359,9 +359,24 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
return
# Ask UnicodeDammit to sniff the most likely encoding.
+
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the HTML5
+ # spec. (See the EncodingDetector class for details.)
+ known_definite_encodings = [user_specified_encoding]
+
+ # This was found in the document; treat it as a slightly lower-priority
+ # user encoding.
+ user_encodings = [document_declared_encoding]
+
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True,
- exclude_encodings=exclude_encodings)
+ dammit = UnicodeDammit(
+ markup,
+ known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings,
+ is_html=True,
+ exclude_encodings=exclude_encodings
+ )
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1b44d75..c670b84 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -180,9 +180,19 @@ class LXMLTreeBuilderForXML(TreeBuilder):
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
- try_encodings = [user_specified_encoding, document_declared_encoding]
+ # This was provided by the end-user; treat it as a known
+ # definite encoding per the algorithm laid out in the HTML5
+ # spec. (See the EncodingDetector class for details.)
+ known_definite_encodings = [user_specified_encoding]
+
+ # This was found in the document; treat it as a slightly lower-priority
+ # user encoding.
+ user_encodings = [document_declared_encoding]
detector = EncodingDetector(
- markup, try_encodings, is_html, exclude_encodings)
+ markup, known_definite_encodings=known_definite_encodings,
+ user_encodings=user_encodings, is_html=is_html,
+ exclude_encodings=exclude_encodings
+ )
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)