Comments, processing instructions, document type declarations, and markup declarations are now treated as preformatted strings, the way CData blocks are. [bug=1001025] Also in this commit: renamed the fix_embedded_windows_1252() method to detwingle().

author: Leonard Richardson <leonardr@segfault.org> 2012-05-24 08:14:37 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2012-05-24 08:14:37 -0400
commit: c84e08aa77764578ca1be2a322a4a7bed12d6851 (patch)
tree: 21981a39db565ca75e22b9ab6d242e7a0121fa09
parent: 0401057f29c9c8e6ee781aa9ca6fd1a395a4b084 (diff)
4 files changed, 88 insertions, 22 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 934246b..acfb93d 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -5,6 +5,10 @@
 
 * Fixed the handling of &quot; with the built-in parser. [bug=993871]
 
+* Comments, processing instructions, document type declarations, and
+  markup declarations are now treated as preformatted strings, the way
+  CData blocks are. [bug=1001025]
+
 = 4.0.5 (20120427) =
 
 * Added a new method, wrap(), which wraps an element in a tag.
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 66a9e9b..58cad9b 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -728,23 +728,29 @@ class UnicodeDammit:
     LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
 
     @classmethod
-    def fix_embedded_windows_1252(cls, in_bytes, actual_encoding="utf8"):
-        """Fix Windows-1252 characters embedded in some other encoding.
+    def detwingle(cls, in_bytes, main_encoding="utf8",
+                  embedded_encoding="windows-1252"):
+        """Fix characters from one encoding embedded in some other encoding.
 
-        Also fixes embedded ISO-8859-1, which is a subset of Windows-1252.
-
-        Currently the only encoding supported is UTF-8.
+        Currently the only situation supported is Windows-1252 (or its
+        subset ISO-8859-1), embedded in UTF-8.
 
         The input must be a bytestring. If you've already converted
         the document to Unicode, you're too late.
 
-        The output is a bytestring in which Windows-1252 characters
-        have been converted to their UTF-8 equivalents.
+        The output is a bytestring in which `embedded_encoding`
+        characters have been converted to their `main_encoding`
+        equivalents.
         """
-        if actual_encoding.lower() not in ('utf8', 'utf-8'):
+        if embedded_encoding.replace('_', '-').lower() not in (
+            'windows-1252', 'windows_1252'):
+            raise NotImplementedError(
+                "Windows-1252 and ISO-8859-1 are the only currently supported "
+                "embedded encodings.")
+
+        if main_encoding.lower() not in ('utf8', 'utf-8'):
             raise NotImplementedError(
-                "UTF-8 is the only currently supported encoding "
-                "for Windows-1252 removal.")
+                "UTF-8 is the only currently supported main encoding.")
 
         byte_chunks = []
 
@@ -755,7 +761,8 @@ class UnicodeDammit:
             if not isinstance(byte, int):
                 # Python 2.x
                 byte = ord(byte)
-            if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
+            if (byte >= cls.FIRST_MULTIBYTE_MARKER
+                and byte <= cls.LAST_MULTIBYTE_MARKER):
                 # This is the start of a UTF-8 multibyte character. Skip
                 # to the end.
                 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
diff --git a/bs4/element.py b/bs4/element.py
index 3ef6ef1..22b8304 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -678,10 +678,12 @@ class NavigableString(unicode, PageElement):
         return self.PREFIX + output + self.SUFFIX
 
 
-class CData(NavigableString):
+class PreformattedString(NavigableString):
+    """A NavigableString not subject to the normal formatting rules.
 
-    PREFIX = u'<![CDATA['
-    SUFFIX = u']]>'
+    The string will be passed into the formatter (to trigger side effects),
+    but the return value will be ignored.
+    """
 
     def output_ready(self, formatter="minimal"):
         """CData strings are passed into the formatter.
@@ -689,25 +691,28 @@ class CData(NavigableString):
         self.format_string(self, formatter)
         return self.PREFIX + self + self.SUFFIX
 
+class CData(PreformattedString):
 
-class ProcessingInstruction(NavigableString):
+    PREFIX = u'<![CDATA['
+    SUFFIX = u']]>'
+
+class ProcessingInstruction(PreformattedString):
 
     PREFIX = u'<?'
     SUFFIX = u'?>'
 
-
-class Comment(NavigableString):
+class Comment(PreformattedString):
 
     PREFIX = u'<!--'
     SUFFIX = u'-->'
 
 
-class Declaration(NavigableString):
+class Declaration(PreformattedString):
     PREFIX = u'<!'
     SUFFIX = u'!>'
 
 
-class Doctype(NavigableString):
+class Doctype(PreformattedString):
 
     @classmethod
     def for_name_and_ids(cls, name, pub_id, system_id):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 7a7872e..e2d81aa 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2419,8 +2419,13 @@ be, you can pass them in as a list::
  dammit.original_encoding
  # 'latin-1'
 
-Unicode, Dammit has one special feature that Beautiful Soup doesn't
-use. You can use it to convert Microsoft smart quotes to HTML or XML
+Unicode, Dammit has two special features that Beautiful Soup doesn't
+use.
+
+Smart quotes
+^^^^^^^^^^^^
+
+You can use Unicode, Dammit to convert Microsoft smart quotes to HTML or XML
 entities::
 
  markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
@@ -2444,6 +2449,51 @@ everything else::
  UnicodeDammit(markup, ["windows-1252"]).unicode_markup
  # u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>'
 
+Inconsistent encodings
+^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes a document is mostly in UTF-8, but contains Windows-1252
+characters such as (again) Microsoft smart quotes. This can happen
+when a website includes data from multiple sources. You can use
+``UnicodeDammit.detwingle()`` to turn such a document into pure
+UTF-8. Here's a simple example::
+
+ snowmen = (u"\N{SNOWMAN}" * 3)
+ quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
+ doc = snowmen.encode("utf8") + quote.encode("windows_1252")
+
+This document is a mess. You can display the snowmen or the smart
+quotes, but not both::
+
+ print(doc)
+ # ☃☃☃�I like snowmen!�
+
+ print(doc.decode("windows-1252"))
+ # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”
+
+Decoding the document as UTF-8 will raise a ``UnicodeDecodeError``,
+but ``UnicodeDammit.detwingle()`` will convert the document to pure
+UTF-8, allowing you to decode it and display the snowmen and
+quote marks simultaneously::
+
+ new_doc = UnicodeDammit.detwingle(doc)
+ print(new_doc.decode("utf8"))
+ # ☃☃☃“I like snowmen!”
+
+``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252
+embedded in UTF-8 (the reverse is not supported), but this is the most
+common case.
+
+Note that you must know to call ``UnicodeDammit.detwingle()`` on your
+data before passing it into ``BeautifulSoup`` or the ``UnicodeDammit``
+constructor. Beautiful Soup assumes that a document has a single
+encoding, whatever it might be. If you pass it a document that
+contains both UTF-8 and Windows-1252, it's likely to think the whole
+document is Windows-1252, and the document will come out looking like
+``â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”``.
+
+``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.
+
 Parsing only part of a document
 ===============================
 
@@ -2565,7 +2615,7 @@ By default, Beautiful Soup parses documents as HTML. To parse a
 document as XML, pass in "xml" as the second argument to the
 ``BeautifulSoup`` constructor::
 
-soup = BeautifulSoup(markup, "xml")
+ soup = BeautifulSoup(markup, "xml")
 
 You'll need to :ref:`have lxml installed <parser-installation>`.
author	Leonard Richardson <leonardr@segfault.org>	2012-05-24 08:14:37 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2012-05-24 08:14:37 -0400
commit	c84e08aa77764578ca1be2a322a4a7bed12d6851 (patch)
tree	21981a39db565ca75e22b9ab6d242e7a0121fa09
parent	0401057f29c9c8e6ee781aa9ca6fd1a395a4b084 (diff)