From ea23194367fb36d201cf6b8134601a73070dff63 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 May 2013 11:33:00 -0400 Subject: Split out the code that guesses at encodings from the code that tries to decode a bytestring based on those encodings. This is necessary because lxml wants to do the decoding itself. --- bs4/dammit.py | 317 +++++++++++++++++++++++++++++-------------------- bs4/diagnose.py | 4 +- bs4/tests/test_soup.py | 24 ++-- 3 files changed, 208 insertions(+), 137 deletions(-) diff --git a/bs4/dammit.py b/bs4/dammit.py index a733cad..71281d2 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -11,6 +11,7 @@ import codecs from htmlentitydefs import codepoint2name import re import logging +import string # Import a library to autodetect character encodings. chardet_type = None @@ -175,7 +176,6 @@ class EntitySubstitution(object): value = cls.quoted_attribute_value(value) return value - @classmethod def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. @@ -192,6 +192,180 @@ class EntitySubstitution(object): cls._substitute_html_entity, s) +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding implied by a short substring at the beginning of + the bytestring, such as a byte-order mark. + + 3. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 4. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 5. UTF-8. + + 6. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False): + self.markup = markup + self.override_encodings = override_encodings or [] + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + self.sniffed_encoding = None + + def _yield(self, encoding, tried): + if encoding not in tried and encoding is not None: + yield encoding + tried.add(encoding) + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + for x in self._yield(e, tried): + yield x + + if self.sniffed_encoding is None: + (self.markup, + self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) + for x in self._yield(self.sniffed_encoding, tried): + yield x + + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + + if self.sniffed_encoding and self.declared_encoding in ( + 'iso-10646-ucs-2', 'ucs-2', 'csunicode', + 'iso-10646-ucs-4', 'ucs-4', 'csucs4', + 'utf-16', 'utf-32', 'utf_16', 'utf_32', + 'utf16', 'u16'): + # We were able to sniff an encoding by looking at the + # first part of the document. The declared encoding is redundant + # with the sniffed encoding. + # + # TODO: Is it really? How do we know? What if the BOM + # says UTF-32 and the declaration says UTF-16? In real + # usage this doesn't matter because this method is + # only called if the sniffed encoding didn't work. 
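
[Editorial note: the encodings property above chains several candidate sources, and the _yield helper both filters out None and deduplicates across sources through the shared `tried` set. The same pattern in isolation — a minimal sketch, names illustrative and not part of bs4:

    def candidates(sources):
        # Yield each non-None candidate exactly once, preserving the
        # order in which the sources were consulted.
        tried = set()
        for source in sources:
            for encoding in source:
                if encoding is not None and encoding not in tried:
                    tried.add(encoding)
                    yield encoding

    print list(candidates([['utf-16le', None], ['utf-8', 'utf-16le']]))
    # ['utf-16le', 'utf-8']

Because this is a generator, later sources — the chardet call in particular — are only evaluated if the consumer has already rejected every earlier candidate and asked for more.]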
+ self.declared_encoding = self.sniffed_encoding + + if self.declared_encoding is not None: + for x in self._yield(self.declared_encoding, tried): + yield x + + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self.chardet_encoding is not None: + for x in self._yield(self.chardet_encoding, tried): + yield x + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + for x in self._yield(e, tried): + yield x + + @classmethod + def strip_byte_order_mark(cls, markup): + "Remove a byte-order mark from a document, and guess at its encoding." + if markup[:4] == b'\x4c\x6f\xa7\x94': + # EBCDIC + # There's no 'ebcdic' codec, so just convert the ebsdic to ASCII. + markup = self.ebcdic_to_ascii(markup) + elif markup[:4] == b'\x00\x3c\x00\x3f': + # UTF-16BE + sniffed_encoding = 'utf-16be' + elif (len(markup) >= 4) and (markup[:2] == b'\xfe\xff') \ + and (markup[2:4] != b'\x00\x00'): + # UTF-16BE with BOM + sniffed_encoding = 'utf-16be' + markup = markup[2:] + elif markup[:4] == b'\x3c\x00\x3f\x00': + # UTF-16LE + sniffed_encoding = 'utf-16le' + elif (len(markup) >= 4) and (markup[:2] == b'\xff\xfe') and \ + (markup[2:4] != b'\x00\x00'): + # UTF-16LE with BOM + sniffed_encoding = 'utf-16le' + markup = markup[2:] + elif markup[:4] == b'\x00\x00\x00\x3c': + # UTF-32BE + sniffed_encoding = 'utf-32be' + elif markup[:4] == b'\x3c\x00\x00\x00': + # UTF-32LE + sniffed_encoding = 'utf-32le' + elif markup[:4] == b'\x00\x00\xfe\xff': + # UTF-32BE with BOM + sniffed_encoding = 'utf-32be' + markup = markup[4:] + elif markup[:4] == b'\xff\xfe\x00\x00': + # UTF-32LE with BOM + sniffed_encoding = 'utf-32le' + markup = markup[4:] + elif markup[:3] == b'\xef\xbb\xbf': + # UTF-8 with BOM + sniffed_encoding = 'utf-8' + markup = markup[3:] + else: + # No idea. + sniffed_encoding = None + return markup, sniffed_encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a tag. 
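
[Editorial note: to make the sniffing table concrete, here is what strip_byte_order_mark is expected to return for a few representative inputs, following the branches above. A sketch of expected behavior; note the EBCDIC branch as written would raise a NameError — it references self.ebcdic_to_ascii from inside a classmethod — and a later commit in this series drops that path entirely:

    from bs4.dammit import EncodingDetector

    # UTF-8 BOM: stripped, and the encoding is sniffed from it.
    print EncodingDetector.strip_byte_order_mark(b'\xef\xbb\xbf<a></a>')
    # ('<a></a>', 'utf-8')

    # UTF-16LE BOM: same idea.
    print EncodingDetector.strip_byte_order_mark(b'\xff\xfe<\x00a\x00>\x00')
    # ('<\x00a\x00>\x00', 'utf-16le')

    # No BOM, no recognizable byte pattern: returned unchanged.
    print EncodingDetector.strip_byte_order_mark(b'<a></a>')
    # ('<a></a>', None)]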
+ """ + declared_encoding = None + declared_encoding_match = xml_encoding_re.match(markup) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii').lower() + return declared_encoding + + EBCDIC_TO_ASCII_MAP = None + + @classmethod + def ebcdic_to_ascii(cls, s): + if not cls.EBCDIC_TO_ASCII_MAP: + emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, + 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, + 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, + 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, + 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, + 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, + 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, + 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, + 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, + 201,202,106,107,108,109,110,111,112,113,114,203,204,205, + 206,207,208,209,126,115,116,117,118,119,120,121,122,210, + 211,212,213,214,215,216,217,218,219,220,221,222,223,224, + 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, + 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, + 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, + 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, + 250,251,252,253,254,255) + cls.EBCDIC_TO_ASCII_MAP = string.maketrans( + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) + return s.translate(cls.EBCDIC_TO_ASCII_MAP) + + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -224,44 +398,22 @@ class UnicodeDammit: self.original_encoding = None return - new_markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) - self.markup = new_markup + self.detector = EncodingDetector(markup, override_encodings, is_html) + self.markup, ignore = self.detector.strip_byte_order_mark(markup) u = None - if new_markup != markup: - # _detectEncoding modified the markup, then converted it to - # Unicode and then to UTF-8. So convert it from UTF-8. - u = self._convert_from("utf8") - self.original_encoding = sniffed_encoding - - if not u: - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break - - # If no luck and we have auto-detection library, try that: - if not u and not isinstance(self.markup, unicode): - u = self._convert_from(chardet_dammit(self.markup)) + for encoding in self.detector.encodings: + u = self._convert_from(encoding) + if u is not None: + break - # As a last resort, try utf-8 and windows-1252: if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. - # As an absolute last resort, try the encodings again with - # character replacement. 
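
[Editorial note: the constructor logic above boils down to a two-pass decode — try every candidate encoding strictly, and only if all of them fail, retry with replacement characters rather than give up. The same control flow in isolation, as a standalone sketch rather than the bs4 API:

    def decode_with_fallback(data, encodings):
        # First pass: strict; skip any candidate that cannot decode.
        for encoding in encodings:
            try:
                return data.decode(encoding), encoding, False
            except (UnicodeDecodeError, LookupError):
                continue
        # Second pass: tolerate data loss instead of failing outright.
        for encoding in encodings:
            if encoding == 'ascii':
                continue  # replacing into ASCII would destroy too much
            try:
                return data.decode(encoding, 'replace'), encoding, True
            except LookupError:
                continue
        return None, None, False

    print decode_with_fallback(b'caf\xe9', ['utf-8', 'windows-1252'])
    # (u'caf\xe9', 'windows-1252', False)]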
- if not u: - for proposed_encoding in ( - override_encodings + [ - document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): - if proposed_encoding != "ascii": - u = self._convert_from(proposed_encoding, "replace") + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") if u is not None: logging.warning( "Some characters could not be decoded, and were " @@ -269,8 +421,9 @@ class UnicodeDammit: self.contains_replacement_characters = True break - # We could at this point force it to ASCII, but that would - # destroy so much data that I think giving up is better + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. self.unicode_markup = u if not u: self.original_encoding = None @@ -344,72 +497,6 @@ class UnicodeDammit: newdata = unicode(data, encoding, errors) return newdata - def _detectEncoding(self, xml_data, is_html=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ - and (xml_data[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ - (xml_data[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_match = xml_encoding_re.match(xml_data) - if not xml_encoding_match and is_html: - xml_encoding_match = html_meta_re.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if is_html: - self.declared_html_encoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding - def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and 
self._codec(charset.replace("-", ""))) \ @@ -427,32 +514,6 @@ class UnicodeDammit: pass return codec - EBCDIC_TO_ASCII_MAP = None - - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. MS_CHARS = {b'\x80': ('euro', '20AC'), diff --git a/bs4/diagnose.py b/bs4/diagnose.py index 25fda5c..f9bff28 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -61,14 +61,14 @@ def diagnose(data): print "-" * 80 -def lxml_trace(data, html=True): +def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful Soup code is running. """ from lxml import etree - for event, element in etree.iterparse(StringIO(data), html=html): + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): print("%s, %4s, %s" % (event, element.tag, element.text)) class AnnouncingParser(HTMLParser): diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b127716..b8a1aed 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -156,13 +156,23 @@ class TestEncodingConversion(SoupTest): def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding - # attribute is set. - ascii = b"a" - soup_from_ascii = self.soup(ascii) - unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) - self.assertEqual(unicode_output, self.document_for(ascii.decode())) - self.assertEqual(soup_from_ascii.original_encoding.lower(), "ascii") + # attribute is set to 'utf-8', a superset of ASCII. + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + # Disable chardet, which will realize that the ASCII is ASCII. + bs4.dammit.chardet_dammit = noop + ascii = b"a" + soup_from_ascii = self.soup(ascii) + unicode_output = soup_from_ascii.decode() + self.assertTrue(isinstance(unicode_output, unicode)) + self.assertEqual(unicode_output, self.document_for(ascii.decode())) + self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet def test_unicode_in_unicode_out(self): # Unicode input is left alone. 
The original_encoding attribute -- cgit v1.2.3 From 342da7818966498e1fc2100c0b920cbc242c9831 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 30 May 2013 12:43:22 -0400 Subject: Refactored code a bit. --- bs4/dammit.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/bs4/dammit.py b/bs4/dammit.py index 71281d2..cb6d354 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -223,24 +223,25 @@ class EncodingDetector: self.declared_encoding = None self.sniffed_encoding = None - def _yield(self, encoding, tried): + def _usable(self, encoding, tried): if encoding not in tried and encoding is not None: - yield encoding tried.add(encoding) + return True + return False @property def encodings(self): """Yield a number of encodings that might work for this markup.""" tried = set() for e in self.override_encodings: - for x in self._yield(e, tried): - yield x + if self._usable(e, tried): + yield e if self.sniffed_encoding is None: (self.markup, self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) - for x in self._yield(self.sniffed_encoding, tried): - yield x + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding if self.declared_encoding is None: self.declared_encoding = self.find_declared_encoding( @@ -261,20 +262,18 @@ class EncodingDetector: # only called if the sniffed encoding didn't work. self.declared_encoding = self.sniffed_encoding - if self.declared_encoding is not None: - for x in self._yield(self.declared_encoding, tried): - yield x + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding if self.chardet_encoding is None: self.chardet_encoding = chardet_dammit(self.markup) - if self.chardet_encoding is not None: - for x in self._yield(self.chardet_encoding, tried): - yield x + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding # As a last-ditch effort, try utf-8 and windows-1252. for e in ('utf-8', 'windows-1252'): - for x in self._yield(e, tried): - yield x + if self._usable(e, tried): + yield e @classmethod def strip_byte_order_mark(cls, markup): -- cgit v1.2.3 From 19f05a586c79b86be8ebe06a3728ab9a94162bee Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Fri, 31 May 2013 09:17:11 -0400 Subject: Create a new lxml parser object for every new parsing strategy. --- bs4/__init__.py | 22 +++++------ bs4/builder/__init__.py | 3 ++ bs4/builder/_html5lib.py | 2 +- bs4/builder/_htmlparser.py | 9 +++-- bs4/builder/_lxml.py | 92 +++++++++++++++++++++++++++++++--------------- bs4/dammit.py | 21 ++++++++--- bs4/testing.py | 13 +++++++ bs4/tests/test_lxml.py | 6 ++- 8 files changed, 115 insertions(+), 53 deletions(-) diff --git a/bs4/__init__.py b/bs4/__init__.py index a949d6d..956f26e 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -26,7 +26,7 @@ __all__ = ['BeautifulSoup'] import re import warnings -from .builder import builder_registry +from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( CData, @@ -160,18 +160,17 @@ class BeautifulSoup(Tag): self.parse_only = parse_only - self.reset() - if hasattr(markup, 'read'): # It's a file-type object. 
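
[Editorial note on the refactoring commit above: Python 2 has no `yield from`, so delegating to the one-item _yield generator forced a nested for loop at every call site. Recasting it as the _usable predicate keeps the actual yields at the top level of encodings. Its behavior in isolation, a small sketch:

    def usable(encoding, tried):
        # True (and the encoding is recorded) only the first time a
        # non-None encoding is seen; mirrors EncodingDetector._usable.
        if encoding is not None and encoding not in tried:
            tried.add(encoding)
            return True
        return False

    tried = set()
    print [e for e in ('utf-8', None, 'UTF-8', 'utf-8') if usable(e, tried)]
    # ['utf-8', 'UTF-8'] -- case-folding arrives in a later commit below]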
markup = markup.read() - (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) = ( - self.builder.prepare_markup(markup, from_encoding)) - - try: - self._feed() - except StopParsing: - pass + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup(markup, from_encoding)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup, e: + pass # Clear out the markup and remove the builder's circular # reference to this object. @@ -353,7 +352,6 @@ class BeautifulStoneSoup(BeautifulSoup): class StopParsing(Exception): pass - class FeatureNotFound(ValueError): pass diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index bae453e..e59dae2 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -296,6 +296,9 @@ def register_treebuilders_from(module): # Register the builder while we're at it. this_module.builder_registry.register(obj) +class ParserRejectedMarkup(Exception): + pass + # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index e439ac8..3bbc9a9 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -27,7 +27,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding - return markup, None, None, False + yield (markup, None, None, False) # These methods are defined by Beautiful Soup. def feed(self, markup): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index e34c9fa..2b98969 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -133,13 +133,14 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): replaced with REPLACEMENT CHARACTER). """ if isinstance(markup, unicode): - return markup, None, None, False + yield (markup, None, None, False) + return try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): args, kwargs = self.parser_args diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index be35d70..601b793 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -13,9 +13,10 @@ from bs4.builder import ( HTML, HTMLTreeBuilder, PERMISSIVE, + ParserRejectedMarkup, TreeBuilder, XML) -from bs4.dammit import UnicodeDammit +from bs4.dammit import EncodingDetector LXML = 'lxml' @@ -33,22 +34,30 @@ class LXMLTreeBuilderForXML(TreeBuilder): # standard. DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} - @property - def default_parser(self): + def default_parser(self, encoding): # This can either return a parser object or a class, which # will be instantiated with default arguments. 
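
[Editorial note: prepare_markup is now a generator of parsing strategies, and the loop in BeautifulSoup.__init__ above simply tries each strategy until one parse completes without ParserRejectedMarkup. Because the loop target is the tuple (self.markup, self.original_encoding, ...), whichever strategy succeeds leaves its metadata on the soup object. Stripped of bs4 specifics, the control flow is — a sketch, with parse_one a placeholder:

    class ParserRejectedMarkup(Exception):
        pass

    def parse_with_fallback(strategies, parse_one):
        # Try each candidate strategy in turn; the first one the
        # underlying parser accepts wins, and the rest are never run.
        for strategy in strategies:
            try:
                return parse_one(strategy)
            except ParserRejectedMarkup:
                continue
        return None]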
- return etree.XMLParser(target=self, strip_cdata=False, recover=True) + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser def __init__(self, parser=None, empty_element_tags=None): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) - if parser is None: - # Use the default parser. - parser = self.default_parser - if isinstance(parser, collections.Callable): - # Instantiate the parser with default arguments - parser = parser(target=self, strip_cdata=False) - self.parser = parser self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS] @@ -63,33 +72,53 @@ class LXMLTreeBuilderForXML(TreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. """ if isinstance(markup, unicode): - return markup, None, None, False + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + detector = EncodingDetector(markup, try_encodings, is_html) + for encoding in detector.encodings: + yield (markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) elif isinstance(markup, unicode): markup = StringIO(markup) + # Call feed() at least once, even if the markup is empty, # or the parser won't be initialized. data = markup.read(self.CHUNK_SIZE) - self.parser.feed(data) - while data != '': - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if data != '': - self.parser.feed(data) - self.parser.close() + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. 
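
[Editorial note: the feed() rewrite below pushes the document into a freshly built parser in CHUNK_SIZE pieces, taking care to call feed() at least once so that even an empty document initializes the parser. The chunking pattern on its own — a sketch, with CHUNK_SIZE an arbitrary stand-in for the builder's constant:

    from io import BytesIO

    CHUNK_SIZE = 512  # illustrative value only

    def feed_in_chunks(data, parser):
        stream = BytesIO(data)
        chunk = stream.read(CHUNK_SIZE)
        parser.feed(chunk)          # always called, even for b''
        while len(chunk) != 0:
            chunk = stream.read(CHUNK_SIZE)
            if len(chunk) != 0:
                parser.feed(chunk)
        parser.close()]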
+ data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError), e: + raise ParserRejectedMarkup(str(e)) def close(self): self.nsmaps = [self.DEFAULT_NSMAPS] @@ -186,13 +215,18 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = [LXML, HTML, FAST, PERMISSIVE] is_xml = False - @property - def default_parser(self): + def default_parser(self, encoding): return etree.HTMLParser def feed(self, markup): - self.parser.feed(markup) - self.parser.close() + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError), e: + raise ParserRejectedMarkup(str(e)) + def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" diff --git a/bs4/dammit.py b/bs4/dammit.py index cb6d354..a8acef9 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -224,9 +224,11 @@ class EncodingDetector: self.sniffed_encoding = None def _usable(self, encoding, tried): - if encoding not in tried and encoding is not None: - tried.add(encoding) - return True + if encoding is not None: + encoding = encoding.lower() + if encoding not in tried: + tried.add(encoding) + return True return False @property @@ -386,18 +388,17 @@ class UnicodeDammit: def __init__(self, markup, override_encodings=[], smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.detector = EncodingDetector(markup, override_encodings, is_html) if markup == '' or isinstance(markup, unicode): self.markup = markup self.unicode_markup = unicode(markup) self.original_encoding = None return - self.detector = EncodingDetector(markup, override_encodings, is_html) self.markup, ignore = self.detector.strip_byte_order_mark(markup) u = None @@ -496,6 +497,16 @@ class UnicodeDammit: newdata = unicode(data, encoding, errors) return newdata + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + @property + def is_html(self): + return self.detector.is_html + def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ diff --git a/bs4/testing.py b/bs4/testing.py index d8ff6b7..c363a89 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -279,6 +279,14 @@ class HTMLTreeBuilderSmokeTest(object): # to detect any differences between them. # + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" strainer = SoupStrainer("b") @@ -482,6 +490,11 @@ class XMLTreeBuilderSmokeTest(object): encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded) + def test_can_parse_unicode_document(self): + markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 
+ soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + def test_popping_namespaced_tag(self): markup = 'b2012-07-02T20:33:42Zcd' soup = self.soup(markup) diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 80458de..27cb2d9 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -4,14 +4,16 @@ import re import warnings try: - from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML - LXML_PRESENT = True import lxml.etree + LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION except ImportError, e: LXML_PRESENT = False LXML_VERSION = (0,) +if LXML_PRESENT: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + from bs4 import ( BeautifulSoup, BeautifulStoneSoup, -- cgit v1.2.3 From 85f183394e0b402a86ce754c27fd95f4bbab4cac Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 2 Jun 2013 13:42:49 -0400 Subject: Treat an lxml ParserError as a ParserRejectedMarkup. --- bs4/builder/_lxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 601b793..8661c78 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError), e: raise ParserRejectedMarkup(str(e)) def close(self): -- cgit v1.2.3 From 6d9221a42dd3fdd679b8e222cb9c73065eeeb747 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 2 Jun 2013 18:28:03 -0400 Subject: It turns out most of the untested code wasn't doing anything useful. --- bs4/builder/_lxml.py | 2 +- bs4/dammit.py | 128 ++++++++----------------------------------------- bs4/tests/test_soup.py | 7 ++- 3 files changed, 26 insertions(+), 111 deletions(-) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 8661c78..92ace07 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -224,7 +224,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError), e: raise ParserRejectedMarkup(str(e)) diff --git a/bs4/dammit.py b/bs4/dammit.py index a8acef9..a5558d7 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -200,20 +200,17 @@ class EncodingDetector: 1. Encodings you specifically tell EncodingDetector to try first (the override_encodings argument to the constructor). - 2. An encoding implied by a short substring at the beginning of - the bytestring, such as a byte-order mark. - - 3. An encoding declared within the bytestring itself, either in an + 2. An encoding declared within the bytestring itself, either in an XML declaration (if the bytestring is to be interpreted as an XML document), or in a tag (if the bytestring is to be interpreted as an HTML document.) - 4. An encoding detected through textual analysis by chardet, + 3. An encoding detected through textual analysis by chardet, cchardet, or a similar external library. - 5. UTF-8. + 4. UTF-8. - 6. Windows-1252. + 5. Windows-1252. 
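
[Editorial note: the test_lxml.py change above is the standard optional-dependency guard — probe for lxml itself first, derive LXML_PRESENT and LXML_VERSION from that probe, and only then import the bs4 builder classes, whose module imports lxml at load time:

    try:
        import lxml.etree
        LXML_PRESENT = True
    except ImportError:
        LXML_PRESENT = False

    if LXML_PRESENT:
        # Safe now: this import pulls in lxml transitively.
        from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML]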
""" def __init__(self, markup, override_encodings=None, is_html=False): self.markup = markup @@ -239,31 +236,12 @@ class EncodingDetector: if self._usable(e, tried): yield e - if self.sniffed_encoding is None: - (self.markup, - self.sniffed_encoding) = self.strip_byte_order_mark(self.markup) - if self._usable(self.sniffed_encoding, tried): - yield self.sniffed_encoding - if self.declared_encoding is None: + # Look within the document for an XML or HTML encoding + # declaration. self.declared_encoding = self.find_declared_encoding( self.markup, self.is_html) - if self.sniffed_encoding and self.declared_encoding in ( - 'iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16'): - # We were able to sniff an encoding by looking at the - # first part of the document. The declared encoding is redundant - # with the sniffed encoding. - # - # TODO: Is it really? How do we know? What if the BOM - # says UTF-32 and the declaration says UTF-16? In real - # usage this doesn't matter because this method is - # only called if the sniffed encoding didn't work. - self.declared_encoding = self.sniffed_encoding - if self._usable(self.declared_encoding, tried): yield self.declared_encoding @@ -277,52 +255,6 @@ class EncodingDetector: if self._usable(e, tried): yield e - @classmethod - def strip_byte_order_mark(cls, markup): - "Remove a byte-order mark from a document, and guess at its encoding." - if markup[:4] == b'\x4c\x6f\xa7\x94': - # EBCDIC - # There's no 'ebcdic' codec, so just convert the ebsdic to ASCII. - markup = self.ebcdic_to_ascii(markup) - elif markup[:4] == b'\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_encoding = 'utf-16be' - elif (len(markup) >= 4) and (markup[:2] == b'\xfe\xff') \ - and (markup[2:4] != b'\x00\x00'): - # UTF-16BE with BOM - sniffed_encoding = 'utf-16be' - markup = markup[2:] - elif markup[:4] == b'\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_encoding = 'utf-16le' - elif (len(markup) >= 4) and (markup[:2] == b'\xff\xfe') and \ - (markup[2:4] != b'\x00\x00'): - # UTF-16LE with BOM - sniffed_encoding = 'utf-16le' - markup = markup[2:] - elif markup[:4] == b'\x00\x00\x00\x3c': - # UTF-32BE - sniffed_encoding = 'utf-32be' - elif markup[:4] == b'\x3c\x00\x00\x00': - # UTF-32LE - sniffed_encoding = 'utf-32le' - elif markup[:4] == b'\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_encoding = 'utf-32be' - markup = markup[4:] - elif markup[:4] == b'\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_encoding = 'utf-32le' - markup = markup[4:] - elif markup[:3] == b'\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_encoding = 'utf-8' - markup = markup[3:] - else: - # No idea. - sniffed_encoding = None - return markup, sniffed_encoding - @classmethod def find_declared_encoding(cls, markup, is_html=False): """Given a document, tries to find its declared encoding. 
@@ -337,35 +269,10 @@ class EncodingDetector: declared_encoding_match = html_meta_re.search(markup) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii').lower() - return declared_encoding - - EBCDIC_TO_ASCII_MAP = None - - @classmethod - def ebcdic_to_ascii(cls, s): - if not cls.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - cls.EBCDIC_TO_ASCII_MAP = string.maketrans( - ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) - return s.translate(cls.EBCDIC_TO_ASCII_MAP) - + 'ascii') + if declared_encoding: + return declared_encoding.lower() + return None class UnicodeDammit: """A class for detecting the encoding of a *ML document and @@ -399,7 +306,7 @@ class UnicodeDammit: self.original_encoding = None return - self.markup, ignore = self.detector.strip_byte_order_mark(markup) + self.markup = markup u = None for encoding in self.detector.encodings: @@ -454,7 +361,7 @@ class UnicodeDammit: # Convert smart quotes to HTML if coming from an encoding # that might have them. 
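
[Editorial note: for context, the smart-quote pass below rewrites Windows-1252's 0x80-0x9F range before the bytes are decoded. A condensed sketch of the html case; only a fragment of the mapping is shown here, the full table being bs4.dammit.MS_CHARS, excerpted further up with the euro-sign entry:

    import re

    MS_CHARS = {b'\x91': ('lsquo', '2018'), b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'), b'\x94': ('rdquo', '201D')}

    def sub_ms_char(match):
        # Replace a smart-quote byte with a named HTML entity.
        name, codepoint = MS_CHARS[match.group(1)]
        return '&%s;' % name

    smart_quotes_re = re.compile(b'([\x80-\x9f])')
    print smart_quotes_re.sub(sub_ms_char, b'\x93quoted\x94')
    # &ldquo;quoted&rdquo;

Bytes outside this fragment would raise a KeyError here; the real _sub_ms_char covers the full range and chooses named or numeric references according to smart_quotes_to.]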
if (self.smart_quotes_to is not None - and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): smart_quotes_re = b"([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) @@ -508,10 +415,15 @@ class UnicodeDammit: return self.detector.is_html def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) or charset + ) + if value: + return value.lower() + return None def _codec(self, charset): if not charset: diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index b8a1aed..c275228 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -15,7 +15,10 @@ from bs4.element import ( NamespacedAttribute, ) import bs4.dammit -from bs4.dammit import EntitySubstitution, UnicodeDammit +from bs4.dammit import ( + EntitySubstitution, + UnicodeDammit, +) from bs4.testing import ( SoupTest, skipIf, @@ -202,7 +205,7 @@ class TestEncodingConversion(SoupTest): self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) class TestUnicodeDammit(unittest.TestCase): - """Standalone tests of Unicode, Dammit.""" + """Standalone tests of UnicodeDammit.""" def test_smart_quotes_to_unicode(self): markup = b"\x91\x92\x93\x94" -- cgit v1.2.3 From 4f9a654766df9ddd05e3ef274b4715b42668724f Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sun, 2 Jun 2013 19:12:07 -0400 Subject: Turns out we had two bits of code to strip byte-order marks. --- bs4/builder/_lxml.py | 2 +- bs4/dammit.py | 77 ++++++++++++++++++++++++++++---------------------- bs4/tests/test_soup.py | 5 ++-- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 92ace07..fa5d498 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -97,7 +97,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector(markup, try_encodings, is_html) for encoding in detector.encodings: - yield (markup, encoding, document_declared_encoding, False) + yield (detector.markup, encoding, document_declared_encoding, False) def feed(self, markup): if isinstance(markup, bytes): diff --git a/bs4/dammit.py b/bs4/dammit.py index a5558d7..9ea432f 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -213,12 +213,13 @@ class EncodingDetector: 5. Windows-1252. """ def __init__(self, markup, override_encodings=None, is_html=False): - self.markup = markup self.override_encodings = override_encodings or [] self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None - self.sniffed_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) def _usable(self, encoding, tried): if encoding is not None: @@ -236,15 +237,21 @@ class EncodingDetector: if self._usable(e, tried): yield e + # Did the document originally start with a byte-order mark + # that indicated its encoding? 
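
[Editorial note: with the BOM handling consolidated into EncodingDetector.__init__, both consumers now see the same thing — lxml's prepare_markup yields detector.markup (the BOM-free bytes), and UnicodeDammit takes its markup from the detector as well. Observable behavior, roughly:

    from bs4.dammit import EncodingDetector

    detector = EncodingDetector(b'\xef\xbb\xbf<a>caf\xc3\xa9</a>')
    print repr(detector.markup)        # '<a>caf\xc3\xa9</a>' -- BOM gone
    print detector.sniffed_encoding    # utf-8
    print list(detector.encodings)[0]  # utf-8 -- the sniffed encoding is
                                       # the first candidate when no
                                       # overrides are given]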
+ if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. if self.declared_encoding is None: - # Look within the document for an XML or HTML encoding - # declaration. self.declared_encoding = self.find_declared_encoding( self.markup, self.is_html) - if self._usable(self.declared_encoding, tried): yield self.declared_encoding + # Use third-party character set detection to guess at the + # encoding. if self.chardet_encoding is None: self.chardet_encoding = chardet_dammit(self.markup) if self._usable(self.chardet_encoding, tried): @@ -255,6 +262,29 @@ class EncodingDetector: if self._usable(e, tried): yield e + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + @classmethod def find_declared_encoding(cls, markup, is_html=False): """Given a document, tries to find its declared encoding. @@ -298,18 +328,21 @@ class UnicodeDammit: self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.is_html = is_html self.detector = EncodingDetector(markup, override_encodings, is_html) - if markup == '' or isinstance(markup, unicode): + + # Is the data in Unicode to begin with? + if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) - self.original_encoding = None - return - self.markup = markup + # As a first step, the encoding detector may strip a byte-order mark. + self.markup = self.detector.markup u = None for encoding in self.detector.encodings: + markup = self.detector.markup u = self._convert_from(encoding) if u is not None: break @@ -382,27 +415,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. 
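
[Editorial note: after this commit the complete candidate order for a bytestring is: override encodings, the BOM-sniffed encoding, the declared encoding, chardet's guess, then utf-8 and windows-1252, deduplicated case-insensitively. For example — a sketch that assumes neither chardet nor cchardet is installed, so that step contributes nothing:

    from bs4.dammit import EncodingDetector

    data = b'<?xml version="1.0" encoding="utf-16"?><root/>'  # no BOM
    detector = EncodingDetector(data, override_encodings=['latin-1'])
    print list(detector.encodings)
    # ['latin-1', 'utf-16', 'utf-8', 'windows-1252']]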
%encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding, errors) - return newdata + return unicode(data, encoding, errors) @property def declared_html_encoding(self): @@ -410,10 +423,6 @@ class UnicodeDammit: return None return self.detector.declared_encoding - @property - def is_html(self): - return self.detector.is_html - def find_codec(self, charset): value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) or (charset and self._codec(charset.replace("-", ""))) diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index c275228..0b69318 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -306,9 +306,8 @@ class TestUnicodeDammit(unittest.TestCase): logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet - def test_sniffed_xml_encoding(self): - # A document written in UTF-16LE will be converted by a different - # code path that sniffs the byte order markers. + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"áé", dammit.unicode_markup) -- cgit v1.2.3
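
[Editorial note: putting the series together, the new test at the end exercises the whole pipeline — EncodingDetector strips the UTF-16LE byte-order mark, offers 'utf-16le' as the first candidate, and UnicodeDammit succeeds on the first strict decode. Roughly:

    # -*- coding: utf-8 -*-
    from bs4.dammit import UnicodeDammit

    data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
    dammit = UnicodeDammit(data)
    print dammit.original_encoding     # utf-16le
    print dammit.unicode_markup        # <a>áé</a>]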