9 files changed, 74 insertions, 18 deletions
diff --git a/NEWS.txt b/NEWS.txt
index b1df902..7084cde 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -20,6 +20,12 @@
 * Unicode, Dammit now detects the encoding in HTML 5-style <meta> tags
   like <meta charset="utf-8" />. [bug=837268]
 
+* If Unicode, Dammit can't figure out a consistent encoding for a
+  page, it will try each of its guesses again, with errors="replace"
+  instead of errors="strict". This may mean that some data gets
+  replaced with REPLACEMENT CHARACTER, but at least most of it will
+  get turned into Unicode. [bug=754903]
+
 * Patched over a bug in html5lib (?) that was crashing Beautiful Soup
   on certain kinds of markup. [bug=838800]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index e6ad425..6917fa9 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -161,7 +161,8 @@ class BeautifulSoup(Tag):
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        self.markup, self.original_encoding, self.declared_html_encoding = (
+        (self.markup, self.original_encoding, self.declared_html_encoding,
+         self.contains_replacement_characters) = (
             self.builder.prepare_markup(markup, from_encoding))
 
         try:
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 2728606..067623e 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -119,7 +119,7 @@ class TreeBuilder(object):
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
-        return markup, None, None
+        return markup, None, None, False
 
     def test_fragment_to_document(self, fragment):
         """Wrap an HTML fragment to make it look like a document.
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 4b80870..9897675 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -29,7 +29,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
     def prepare_markup(self, markup, user_specified_encoding):
         # Store the user-specified encoding for use later on.
         self.user_specified_encoding = user_specified_encoding
-        return markup, None, None
+        return markup, None, None, False
 
     # These methods are defined by Beautiful Soup.
     def feed(self, markup):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index edd0bfb..c785eed 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -51,16 +51,18 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None):
         """
-        :return: A 3-tuple (markup, original encoding, encoding
-        declared within markup).
+        :return: A 4-tuple (markup, original encoding, encoding
+        declared within markup, whether any characters had to be
+        replaced with REPLACEMENT CHARACTER).
         """
         if isinstance(markup, unicode):
-            return markup, None, None
+            return markup, None, None, False
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, is_html=True)
         return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding)
+                dammit.declared_html_encoding,
+                dammit.contains_replacement_characters)
 
     def feed(self, markup):
         super(HTMLParserTreeBuilder, self).feed(markup)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 7219e49..cc3cb86 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -50,12 +50,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         declared within markup).
         """
         if isinstance(markup, unicode):
-            return markup, None, None
+            return markup, None, None, False
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, is_html=True)
         return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding)
+                dammit.declared_html_encoding,
+                dammit.contains_replacement_characters)
 
     def feed(self, markup):
         self.parser.feed(markup)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 0c4bf17..76ac9ce 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -173,6 +173,7 @@ class UnicodeDammit:
         self.declared_html_encoding = None
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
+        self.contains_replacement_characters = False
 
         if markup == '' or isinstance(markup, unicode):
             self.markup = markup
@@ -202,6 +203,20 @@ class UnicodeDammit:
                 if u:
                     break
 
+        # As an absolute last resort, try the encodings again with
+        # character replacement.
+        if not u:
+            for proposed_encoding in (
+                override_encodings + [
+                    document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
+                if proposed_encoding != "ascii":
+                    u = self._convert_from(proposed_encoding, "replace")
+                if u is not None:
+                    self.contains_replacement_characters = True
+                    break
+
+        # We could at this point force it to ASCII, but that would
+        # destroy so much data that I think giving up is better
         self.unicode_markup = u
         if not u:
             self.original_encoding = None
@@ -220,11 +235,11 @@ class UnicodeDammit:
             sub = sub.encode()
         return sub
 
-    def _convert_from(self, proposed):
+    def _convert_from(self, proposed, errors="strict"):
         proposed = self.find_codec(proposed)
-        if not proposed or proposed in self.tried_encodings:
+        if not proposed or (proposed, errors) in self.tried_encodings:
             return None
-        self.tried_encodings.append(proposed)
+        self.tried_encodings.append((proposed, errors))
         markup = self.markup
 
         # Convert smart quotes to HTML if coming from an encoding
@@ -236,18 +251,19 @@ class UnicodeDammit:
             markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
 
         try:
-            # print "Trying to convert document to %s" % proposed
-            u = self._to_unicode(markup, proposed)
+            #print "Trying to convert document to %s (errors=%s)" % (
+            #    proposed, errors)
+            u = self._to_unicode(markup, proposed, errors)
             self.markup = u
             self.original_encoding = proposed
         except Exception as e:
-            # print "That didn't work!"
-            # print e
+            #print "That didn't work!"
+            #print e
             return None
         #print "Correct encoding: %s" % proposed
         return self.markup
 
-    def _to_unicode(self, data, encoding):
+    def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
 
@@ -269,7 +285,7 @@ class UnicodeDammit:
         elif data[:4] == '\xff\xfe\x00\x00':
             encoding = 'utf-32le'
             data = data[4:]
-        newdata = unicode(data, encoding)
+        newdata = unicode(data, encoding, errors)
         return newdata
 
     def _detectEncoding(self, xml_data, is_html=False):
diff --git a/bs4/doc/source/index.rst b/bs4/doc/source/index.rst
index abea5c6..d28787b 100644
--- a/bs4/doc/source/index.rst
+++ b/bs4/doc/source/index.rst
@@ -2076,6 +2076,15 @@ We can fix this by passing in the correct ``from_encoding``::
  soup.original_encoding
  'iso8859-8'
 
+In rare cases (usually when a UTF-8 document contains text written in
+a completely different encoding), the only way to get Unicode may be
+to replace some characters with the special Unicode character
+"REPLACEMENT CHARACTER" (U+FFFD, �). If Unicode, Dammit needs to do
+this, it will set the ``.contains_replacement_characters`` attribute
+to ``True`` on the ``UnicodeDammit`` or ``BeautifulSoup`` object. This
+lets you know that the Unicode representation is not an exact
+representation of the original--some data was lost.
+
 Output encoding
 ---------------
 
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddfc68c..d744694 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -2,6 +2,7 @@
 """Tests of Beautiful Soup as a whole."""
 
 import unittest
+from bs4 import BeautifulSoup
 from bs4.element import SoupStrainer
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 from bs4.testing import SoupTest
@@ -162,3 +163,23 @@ class TestUnicodeDammit(unittest.TestCase):
             dammit = UnicodeDammit(data, is_html=True)
             self.assertEquals(
                 "euc-jp", dammit.original_encoding)
+
+    def test_last_ditch_entity_replacement(self):
+        # This is a UTF-8 document that contains bytestrings
+        # completely incompatible with UTF-8 (encoded with some other
+        # encoding).
+        #
+        # Since there is no consistent encoding for the document,
+        # Unicode, Dammit will eventually decode the document as UTF-8
+        # and replace the incompatible bytes with REPLACEMENT
+        # CHARACTER.
+
+        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
+<html><b>\330\250\330\252\330\261</b>
+<i>\310\322\321\220\312\321\355\344</i></html>"""
+        dammit = UnicodeDammit(doc)
+        self.assertEqual(True, dammit.contains_replacement_characters)
+        self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+
+        soup = BeautifulSoup(doc)
+        self.assertTrue(soup.contains_replacement_characters)