diff options
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/dammit.py | 29 | ||||
-rw-r--r-- | bs4/element.py | 21 | ||||
-rw-r--r-- | doc/source/index.rst | 56 |
4 files changed, 88 insertions, 22 deletions
@@ -5,6 +5,10 @@ * Fixed the handling of " with the built-in parser. [bug=993871] +* Comments, processing instructions, document type declarations, and + markup declarations are now treated as preformatted strings, the way + CData blocks are. [bug=1001025] + = 4.0.5 (20120427) = * Added a new method, wrap(), which wraps an element in a tag. diff --git a/bs4/dammit.py b/bs4/dammit.py index 66a9e9b..58cad9b 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -728,23 +728,29 @@ class UnicodeDammit: LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] @classmethod - def fix_embedded_windows_1252(cls, in_bytes, actual_encoding="utf8"): - """Fix Windows-1252 characters embedded in some other encoding. + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. - Also fixes embedded ISO-8859-1, which is a subset of Windows-1252. - - Currently the only encoding supported is UTF-8. + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. The input must be a bytestring. If you've already converted the document to Unicode, you're too late. - The output is a bytestring in which Windows-1252 characters - have been converted to their UTF-8 equivalents. + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. 
""" - if actual_encoding.lower() not in ('utf8', 'utf-8'): + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): raise NotImplementedError( - "UTF-8 is the only currently supported encoding " - "for Windows-1252 removal.") + "UTF-8 is the only currently supported main encoding.") byte_chunks = [] @@ -755,7 +761,8 @@ class UnicodeDammit: if not isinstance(byte, int): # Python 2.x byte = ord(byte) - if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER: + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): # This is the start of a UTF-8 multibyte character. Skip # to the end. for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: diff --git a/bs4/element.py b/bs4/element.py index 3ef6ef1..22b8304 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -678,10 +678,12 @@ class NavigableString(unicode, PageElement): return self.PREFIX + output + self.SUFFIX -class CData(NavigableString): +class PreformattedString(NavigableString): + """A NavigableString not subject to the normal formatting rules. - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' + The string will be passed into the formatter (to trigger side effects), + but the return value will be ignored. + """ def output_ready(self, formatter="minimal"): """CData strings are passed into the formatter. @@ -689,25 +691,28 @@ class CData(NavigableString): self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX +class CData(PreformattedString): -class ProcessingInstruction(NavigableString): + PREFIX = u'<![CDATA[' + SUFFIX = u']]>' + +class ProcessingInstruction(PreformattedString): PREFIX = u'<?' 
SUFFIX = u'?>' - -class Comment(NavigableString): +class Comment(PreformattedString): PREFIX = u'<!--' SUFFIX = u'-->' -class Declaration(NavigableString): +class Declaration(PreformattedString): PREFIX = u'<!' SUFFIX = u'!>' -class Doctype(NavigableString): +class Doctype(PreformattedString): @classmethod def for_name_and_ids(cls, name, pub_id, system_id): diff --git a/doc/source/index.rst b/doc/source/index.rst index 7a7872e..e2d81aa 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -2419,8 +2419,13 @@ be, you can pass them in as a list:: dammit.original_encoding # 'latin-1' -Unicode, Dammit has one special feature that Beautiful Soup doesn't -use. You can use it to convert Microsoft smart quotes to HTML or XML +Unicode, Dammit has two special features that Beautiful Soup doesn't +use. + +Smart quotes +^^^^^^^^^^^^ + +You can use Unicode, Dammit to convert Microsoft smart quotes to HTML or XML entities:: markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>" @@ -2444,6 +2449,51 @@ everything else:: UnicodeDammit(markup, ["windows-1252"]).unicode_markup # u'<p>I just \u201clove\u201d Microsoft Word\u2019s smart quotes</p>' +Inconsistent encodings +^^^^^^^^^^^^^^^^^^^^^^ + +Sometimes a document is mostly in UTF-8, but contains Windows-1252 +characters such as (again) Microsoft smart quotes. This can happen +when a website includes data from multiple sources. You can use +``UnicodeDammit.detwingle()`` to turn such a document into pure +UTF-8. Here's a simple example:: + + snowmen = (u"\N{SNOWMAN}" * 3) + quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}") + doc = snowmen.encode("utf8") + quote.encode("windows_1252") + +This document is a mess. 
You can display the snowmen or the smart +quotes, but not both:: + + print(doc) + # ☃☃☃�I like snowmen!� + + print(doc.decode("windows-1252")) + # â˜ƒâ˜ƒâ˜ƒ“I like snowmen!” + +Decoding the document as UTF-8 will raise a ``UnicodeDecodeError``, +but ``UnicodeDammit.detwingle()`` will convert the document to pure +UTF-8, allowing you to decode it and display the snowmen and +quote marks simultaneously:: + + new_doc = UnicodeDammit.detwingle(doc) + print(new_doc.decode("utf8")) + # ☃☃☃“I like snowmen!” + +``UnicodeDammit.detwingle()`` only knows how to handle Windows-1252 +embedded in UTF-8 (or vice versa, I suppose), but this is the most +common case. + +Note that you must know to call ``UnicodeDammit.detwingle()`` on your +data before passing it into ``BeautifulSoup`` or the ``UnicodeDammit`` +constructor. Beautiful Soup assumes that a document has a single +encoding, whatever it might be. If you pass it a document that +contains both UTF-8 and Windows-1252, it's likely to think the whole +document is Windows-1252, and the document will come out looking like +``â˜ƒâ˜ƒâ˜ƒ“I like snowmen!”``. + +``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0. + Parsing only part of a document =============================== @@ -2565,7 +2615,7 @@ By default, Beautiful Soup parses documents as HTML. To parse a document as XML, pass in "xml" as the second argument to the ``BeautifulSoup`` constructor:: -soup = BeautifulSoup(markup, "xml") + soup = BeautifulSoup(markup, "xml") You'll need to :ref:`have lxml installed <parser-installation>`. |