summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/dammit.py | 29
-rw-r--r-- bs4/element.py | 21
2 files changed, 31 insertions, 19 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 66a9e9b..58cad9b 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -728,23 +728,29 @@ class UnicodeDammit:
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
- def fix_embedded_windows_1252(cls, in_bytes, actual_encoding="utf8"):
- """Fix Windows-1252 characters embedded in some other encoding.
+ def detwingle(cls, in_bytes, main_encoding="utf8",
+ embedded_encoding="windows-1252"):
+ """Fix characters from one encoding embedded in some other encoding.
- Also fixes embedded ISO-8859-1, which is a subset of Windows-1252.
-
- Currently the only encoding supported is UTF-8.
+ Currently the only situation supported is Windows-1252 (or its
+ subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
- The output is a bytestring in which Windows-1252 characters
- have been converted to their UTF-8 equivalents.
+ The output is a bytestring in which `embedded_encoding`
+ characters have been converted to their `main_encoding`
+ equivalents.
"""
- if actual_encoding.lower() not in ('utf8', 'utf-8'):
+ if embedded_encoding.replace('_', '-').lower() not in (
+ 'windows-1252', 'windows_1252'):
+ raise NotImplementedError(
+ "Windows-1252 and ISO-8859-1 are the only currently supported "
+ "embedded encodings.")
+
+ if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
- "UTF-8 is the only currently supported encoding "
- "for Windows-1252 removal.")
+ "UTF-8 is the only currently supported main encoding.")
byte_chunks = []
@@ -755,7 +761,8 @@ class UnicodeDammit:
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
- if byte >= cls.FIRST_MULTIBYTE_MARKER and byte <= cls.LAST_MULTIBYTE_MARKER:
+ if (byte >= cls.FIRST_MULTIBYTE_MARKER
+ and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
diff --git a/bs4/element.py b/bs4/element.py
index 3ef6ef1..22b8304 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -678,10 +678,12 @@ class NavigableString(unicode, PageElement):
return self.PREFIX + output + self.SUFFIX
-class CData(NavigableString):
+class PreformattedString(NavigableString):
+ """A NavigableString not subject to the normal formatting rules.
- PREFIX = u'<![CDATA['
- SUFFIX = u']]>'
+ The string will be passed into the formatter (to trigger side effects),
+ but the return value will be ignored.
+ """
def output_ready(self, formatter="minimal"):
"""CData strings are passed into the formatter.
@@ -689,25 +691,28 @@ class CData(NavigableString):
self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
+class CData(PreformattedString):
-class ProcessingInstruction(NavigableString):
+ PREFIX = u'<![CDATA['
+ SUFFIX = u']]>'
+
+class ProcessingInstruction(PreformattedString):
PREFIX = u'<?'
SUFFIX = u'?>'
-
-class Comment(NavigableString):
+class Comment(PreformattedString):
PREFIX = u'<!--'
SUFFIX = u'-->'
-class Declaration(NavigableString):
+class Declaration(PreformattedString):
PREFIX = u'<!'
SUFFIX = u'!>'
-class Doctype(NavigableString):
+class Doctype(PreformattedString):
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):