diff options
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 9 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 5 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 4 | ||||
-rw-r--r-- | bs4/dammit.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 19 | ||||
-rw-r--r-- | doc/source/index.rst | 13 |
8 files changed, 63 insertions, 9 deletions
@@ -18,6 +18,10 @@ argument described in the documentation. `text` may eventually change its meaning, but not for a very long time. [bug=1366856] +* Added an `exclude_encodings` argument to UnicodeDammit and to the + Beautiful Soup constructor, which lets you prohibit the detection of + an encoding that you know is wrong. [bug=1469408] + * Fixed yet another problem that caused the html5lib tree builder to create a disconnected parse tree. [bug=1237763] diff --git a/bs4/__init__.py b/bs4/__init__.py index 4b92152..e167544 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -80,7 +80,8 @@ class BeautifulSoup(Tag): NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, **kwargs): + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" @@ -202,7 +203,8 @@ class BeautifulSoup(Tag): for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( - self.builder.prepare_markup(markup, from_encoding)): + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() try: self._feed() diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 0778dde..7788063 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -29,9 +29,16 @@ class HTML5TreeBuilder(HTMLTreeBuilder): features = [NAME, 
PERMISSIVE, HTML_5, HTML] - def prepare_markup(self, markup, user_specified_encoding): + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encodings, but the html5lib tree builder doesn't support exclude_encodings.") yield (markup, None, None, False) # These methods are defined by Beautiful Soup. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index b2cd467..25811f1 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -138,7 +138,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be @@ -149,7 +149,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index b0bc8a0..2e33386 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -77,6 +77,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, document_declared_encoding=None): """ 
:yield: A series of 4-tuples. @@ -102,7 +103,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector(markup, try_encodings, is_html) + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) diff --git a/bs4/dammit.py b/bs4/dammit.py index 7ced3a5..8e6b347 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -213,8 +213,11 @@ class EncodingDetector: 5. Windows-1252. """ - def __init__(self, markup, override_encodings=None, is_html=False): + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None @@ -225,6 +228,8 @@ class EncodingDetector: def _usable(self, encoding, tried): if encoding is not None: encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False if encoding not in tried: tried.add(encoding) return True @@ -332,13 +337,14 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): + smart_quotes_to=None, is_html=False, exclude_encodings=[]): self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - self.detector = EncodingDetector(markup, override_encodings, is_html) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. 
if isinstance(markup, unicode) or markup == '': diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index e2e2c30..3643aed 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -50,6 +50,11 @@ class TestConstructor(SoupTest): soup = self.soup(data) self.assertEqual(u"foo\0bar", soup.h1.string) + def test_exclude_encodings(self): + utf8_data = u"Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + class TestWarnings(SoupTest): @@ -322,6 +327,20 @@ class TestUnicodeDammit(unittest.TestCase): dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + def test_exclude_encodings(self): + # This is UTF-8. + utf8_data = u"Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. 
+ dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): detected = EncodingDetector( b'<?xml version="1.0" encoding="UTF-\xdb" ?>') diff --git a/doc/source/index.rst b/doc/source/index.rst index 1b7b1e6..821dad4 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2397,6 +2397,19 @@ We can fix this by passing in the correct ``from_encoding``:: soup.original_encoding 'iso8859-8' +If you don't know what the correct encoding is, but you know that +Unicode, Dammit is guessing wrong, you can pass the wrong guesses in +as ``exclude_encodings``:: + + soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"]) + soup.h1 + <h1>םולש</h1> + soup.original_encoding + 'WINDOWS-1255' + +(This isn't 100% correct, but Windows-1255 is a compatible superset of +ISO-8859-8, so it's close enough.) + In rare cases (usually when a UTF-8 document contains text written in a completely different encoding), the only way to get Unicode may be to replace some characters with the special Unicode character |