diff options
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/__init__.py | 3 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 2 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 10 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 5 | ||||
-rw-r--r-- | bs4/dammit.py | 34 | ||||
-rw-r--r-- | bs4/doc/source/index.rst | 9 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 21 |
9 files changed, 74 insertions, 18 deletions
@@ -20,6 +20,12 @@ * Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags like <meta charset="utf-8" />. [bug=837268] +* If Unicode, Dammit can't figure out a consistent encoding for a + page, it will try each of its guesses again, with errors="replace" + instead of errors="strict". This may mean that some data gets + replaced with REPLACEMENT CHARACTER, but at least most of it will + get turned into Unicode. [bug=754903] + * Patched over a bug in html5lib (?) that was crashing Beautiful Soup on certain kinds of markup. [bug=838800] diff --git a/bs4/__init__.py b/bs4/__init__.py index e6ad425..6917fa9 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -161,7 +161,8 @@ class BeautifulSoup(Tag): if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - self.markup, self.original_encoding, self.declared_html_encoding = ( + (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) = ( self.builder.prepare_markup(markup, from_encoding)) try: diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 2728606..067623e 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -119,7 +119,7 @@ class TreeBuilder(object): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): - return markup, None, None + return markup, None, None, False def test_fragment_to_document(self, fragment): """Wrap an HTML fragment to make it look like a document. diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 4b80870..9897675 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -29,7 +29,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding - return markup, None, None + return markup, None, None, False # These methods are defined by Beautiful Soup. 
def feed(self, markup): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index edd0bfb..c785eed 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -51,16 +51,18 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): - return markup, None, None + return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding) + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): super(HTMLParserTreeBuilder, self).feed(markup) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 7219e49..cc3cb86 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -50,12 +50,13 @@ class LXMLTreeBuilderForXML(TreeBuilder): declared within markup). 
""" if isinstance(markup, unicode): - return markup, None, None + return markup, None, None, False try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding) + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): self.parser.feed(markup) diff --git a/bs4/dammit.py b/bs4/dammit.py index 0c4bf17..76ac9ce 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -173,6 +173,7 @@ class UnicodeDammit: self.declared_html_encoding = None self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] + self.contains_replacement_characters = False if markup == '' or isinstance(markup, unicode): self.markup = markup @@ -202,6 +203,20 @@ class UnicodeDammit: if u: break + # As an absolute last resort, try the encodings again with + # character replacement. + if not u: + for proposed_encoding in ( + override_encodings + [ + document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): + if proposed_encoding != "ascii": + u = self._convert_from(proposed_encoding, "replace") + if u is not None: + self.contains_replacement_characters = True + break + + # We could at this point force it to ASCII, but that would + # destroy so much data that I think giving up is better self.unicode_markup = u if not u: self.original_encoding = None @@ -220,11 +235,11 @@ class UnicodeDammit: sub = sub.encode() return sub - def _convert_from(self, proposed): + def _convert_from(self, proposed, errors="strict"): proposed = self.find_codec(proposed) - if not proposed or proposed in self.tried_encodings: + if not proposed or (proposed, errors) in self.tried_encodings: return None - self.tried_encodings.append(proposed) + self.tried_encodings.append((proposed, errors)) markup = self.markup # Convert smart quotes to HTML if coming from an encoding @@ -236,18 +251,19 @@ class UnicodeDammit: markup = 
smart_quotes_compiled.sub(self._sub_ms_char, markup) try: - # print "Trying to convert document to %s" % proposed - u = self._to_unicode(markup, proposed) + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) self.markup = u self.original_encoding = proposed except Exception as e: - # print "That didn't work!" - # print e + #print "That didn't work!" + #print e return None #print "Correct encoding: %s" % proposed return self.markup - def _to_unicode(self, data, encoding): + def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' @@ -269,7 +285,7 @@ class UnicodeDammit: elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] - newdata = unicode(data, encoding) + newdata = unicode(data, encoding, errors) return newdata def _detectEncoding(self, xml_data, is_html=False): diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst index abea5c6..d28787b 100644 --- a/bs4/doc/source/index.rst +++ b/bs4/doc/source/index.rst @@ -2076,6 +2076,15 @@ We can fix this by passing in the correct ``from_encoding``:: soup.original_encoding 'iso8859-8' +In rare cases (usually when a UTF-8 document contains text written in +a completely different encoding), the only way to get Unicode may be +to replace some characters with the special Unicode character +"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do +this, it will set the ``.contains_replacement_characters`` attribute to +``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This +lets you know that the Unicode representation is not an exact +representation of the original--some data was lost. 
+ Output encoding --------------- diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index ddfc68c..d744694 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -2,6 +2,7 @@ """Tests of Beautiful Soup as a whole.""" import unittest +from bs4 import BeautifulSoup from bs4.element import SoupStrainer from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.testing import SoupTest @@ -162,3 +163,23 @@ class TestUnicodeDammit(unittest.TestCase): dammit = UnicodeDammit(data, is_html=True) self.assertEquals( "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (encoded with some other + # encoding). + # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually encode the document as UTF-8 + # and encode the incompatible characters as REPLACEMENT + # CHARACTER. + + doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> +<html><b>\330\250\330\252\330\261</b> +<i>\310\322\321\220\312\321\355\344</i></html>""" + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue(u"\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc) + self.assertTrue(soup.contains_replacement_characters) |