summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt6
-rw-r--r--bs4/__init__.py3
-rw-r--r--bs4/builder/__init__.py2
-rw-r--r--bs4/builder/_html5lib.py2
-rw-r--r--bs4/builder/_htmlparser.py10
-rw-r--r--bs4/builder/_lxml.py5
-rw-r--r--bs4/dammit.py34
-rw-r--r--bs4/doc/source/index.rst9
-rw-r--r--bs4/tests/test_soup.py21
9 files changed, 74 insertions, 18 deletions
diff --git a/NEWS.txt b/NEWS.txt
index b1df902..7084cde 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -20,6 +20,12 @@
* Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
like <meta charset="utf-8" />. [bug=837268]
+* If Unicode, Dammit can't figure out a consistent encoding for a
+ page, it will try each of its guesses again, with errors="replace"
+ instead of errors="strict". This may mean that some data gets
+ replaced with REPLACEMENT CHARACTER, but at least most of it will
+ get turned into Unicode. [bug=754903]
+
* Patched over a bug in html5lib (?) that was crashing Beautiful Soup
on certain kinds of markup. [bug=838800]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e6ad425..6917fa9 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -161,7 +161,8 @@ class BeautifulSoup(Tag):
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- self.markup, self.original_encoding, self.declared_html_encoding = (
+ (self.markup, self.original_encoding, self.declared_html_encoding,
+ self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 2728606..067623e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -119,7 +119,7 @@ class TreeBuilder(object):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
- return markup, None, None
+ return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 4b80870..9897675 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -29,7 +29,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
- return markup, None, None
+ return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index edd0bfb..c785eed 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -51,16 +51,18 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
+ :return: A 4-tuple (markup, original encoding, encoding
+ declared within markup, whether any characters had to be
+ replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
- return markup, None, None
+ return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding)
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
def feed(self, markup):
super(HTMLParserTreeBuilder, self).feed(markup)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 7219e49..cc3cb86 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -50,12 +50,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
declared within markup).
"""
if isinstance(markup, unicode):
- return markup, None, None
+ return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding)
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
def feed(self, markup):
self.parser.feed(markup)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 0c4bf17..76ac9ce 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -173,6 +173,7 @@ class UnicodeDammit:
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
+ self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
@@ -202,6 +203,20 @@ class UnicodeDammit:
if u:
break
+ # As an absolute last resort, try the encodings again with
+ # character replacement.
+ if not u:
+ for proposed_encoding in (
+ override_encodings + [
+ document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
+ if proposed_encoding != "ascii":
+ u = self._convert_from(proposed_encoding, "replace")
+ if u is not None:
+ self.contains_replacement_characters = True
+ break
+
+ # We could at this point force it to ASCII, but that would
+ # destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
@@ -220,11 +235,11 @@ class UnicodeDammit:
sub = sub.encode()
return sub
- def _convert_from(self, proposed):
+ def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
- if not proposed or proposed in self.tried_encodings:
+ if not proposed or (proposed, errors) in self.tried_encodings:
return None
- self.tried_encodings.append(proposed)
+ self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
@@ -236,18 +251,19 @@ class UnicodeDammit:
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
- # print "Trying to convert document to %s" % proposed
- u = self._to_unicode(markup, proposed)
+ #print "Trying to convert document to %s (errors=%s)" % (
+ # proposed, errors)
+ u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
- # print "That didn't work!"
- # print e
+ #print "That didn't work!"
+ #print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
- def _to_unicode(self, data, encoding):
+ def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
@@ -269,7 +285,7 @@ class UnicodeDammit:
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
- newdata = unicode(data, encoding)
+ newdata = unicode(data, encoding, errors)
return newdata
def _detectEncoding(self, xml_data, is_html=False):
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index abea5c6..d28787b 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -2076,6 +2076,15 @@ We can fix this by passing in the correct ``from_encoding``::
soup.original_encoding
'iso8859-8'
+In rare cases (usually when a UTF-8 document contains text written in
+a completely different encoding), the only way to get Unicode may be
+to replace some characters with the special Unicode character
+"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do
+this, it will set the ``.contains_replacement_characters`` attribute to
+``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This
+lets you know that the Unicode representation is not an exact
+representation of the original--some data was lost.
+
Output encoding
---------------
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddfc68c..d744694 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -2,6 +2,7 @@
"""Tests of Beautiful Soup as a whole."""
import unittest
+from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import SoupTest
@@ -162,3 +163,23 @@ class TestUnicodeDammit(unittest.TestCase):
dammit = UnicodeDammit(data, is_html=True)
self.assertEquals(
"euc-jp", dammit.original_encoding)
+
+ def test_last_ditch_entity_replacement(self):
+ # This is a UTF-8 document that contains bytestrings
+ # completely incompatible with UTF-8 (encoded with some other
+ # encoding).
+ #
+ # Since there is no consistent encoding for the document,
+ # Unicode, Dammit will eventually decode the document as UTF-8
+ # and replace the incompatible characters with REPLACEMENT
+ # CHARACTER.
+
+ doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
+<html><b>\330\250\330\252\330\261</b>
+<i>\310\322\321\220\312\321\355\344</i></html>"""
+ dammit = UnicodeDammit(doc)
+ self.assertEqual(True, dammit.contains_replacement_characters)
+ self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+ soup = BeautifulSoup(doc)
+ self.assertTrue(soup.contains_replacement_characters)