-rw-r--r-- | bs4/builder/__init__.py        | 263
-rw-r--r-- | bs4/builder/_htmlparser.py     |  94
-rw-r--r-- | bs4/dammit.py                  | 412
-rw-r--r-- | tests/test_builder_registry.py |   8
-rw-r--r-- | tests/test_htmlparser.py       | 126
-rw-r--r-- | tests/test_lxml.py             |  23
6 files changed, 903 insertions, 23 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
new file mode 100644
index 0000000..17dcff3
--- /dev/null
+++ b/bs4/builder/__init__.py
@@ -0,0 +1,263 @@
+from collections import defaultdict
+import re
+import sys
+
+__all__ = [
+    'HTMLTreeBuilder',
+    'SAXTreeBuilder',
+    'TreeBuilder',
+    'TreeBuilderRegistry',
+    ]
+
+# Some useful features for a TreeBuilder to have.
+FAST = 'fast'
+PERMISSIVE = 'permissive'
+STRICT = 'strict'
+XML = 'xml'
+HTML = 'html'
+HTML_5 = 'html5'
+
+
+class TreeBuilderRegistry(object):
+
+    def __init__(self):
+        self.builders_for_feature = defaultdict(list)
+        self.builders = []
+
+    def register(self, treebuilder_class):
+        """Register a treebuilder based on its advertised features."""
+        for feature in treebuilder_class.features:
+            self.builders_for_feature[feature].insert(0, treebuilder_class)
+        self.builders.insert(0, treebuilder_class)
+
+    def lookup(self, *features):
+        if len(self.builders) == 0:
+            # There are no builders at all.
+            return None
+
+        if len(features) == 0:
+            # They didn't ask for any features. Give them the most
+            # recently registered builder.
+            return self.builders[0]
+
+        # Go down the list of features in order, and eliminate any builders
+        # that don't match every feature.
+        features = list(features)
+        features.reverse()
+        candidates = None
+        candidate_set = None
+        while len(features) > 0:
+            feature = features.pop()
+            we_have_the_feature = self.builders_for_feature.get(feature, [])
+            if len(we_have_the_feature) > 0:
+                if candidates is None:
+                    candidates = we_have_the_feature
+                    candidate_set = set(candidates)
+                else:
+                    # Eliminate any candidates that don't have this feature.
+                    candidate_set = candidate_set.intersection(
+                        set(we_have_the_feature))
+
+        # The only valid candidates are the ones in candidate_set.
+        # Go through the original list of candidates and pick the first one
+        # that's in candidate_set.
+        if candidate_set is None:
+            return None
+        for candidate in candidates:
+            if candidate in candidate_set:
+                return candidate
+        return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
+
+
+class TreeBuilder(object):
+    """Turn a document into a Beautiful Soup object tree."""
+
+    features = []
+
+    is_xml = False
+    preserve_whitespace_tags = set()
+    empty_element_tags = None # A tag will be considered an empty-element
+                              # tag when and only when it has no contents.
+
+    def __init__(self):
+        self.soup = None
+
+    def reset(self):
+        pass
+
+    def can_be_empty_element(self, tag_name):
+        """Might a tag with this name be an empty-element tag?
+
+        The final markup may or may not actually present this tag as
+        self-closing.
+
+        For instance: an HTMLBuilder does not consider a <p> tag to be
+        an empty-element tag (it's not in
+        HTMLBuilder.empty_element_tags). This means an empty <p> tag
+        will be presented as "<p></p>", not "<p />".
+
+        The default implementation has no opinion about which tags are
+        empty-element tags, so a tag will be presented as an
+        empty-element tag if and only if it has no contents.
+        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        be left alone.
+        """
+        if self.empty_element_tags is None:
+            return True
+        return tag_name in self.empty_element_tags
+
+    def feed(self, markup):
+        raise NotImplementedError()
+
+    def prepare_markup(self, markup, user_specified_encoding=None,
+                       document_declared_encoding=None):
+        return markup, None, None
+
+    def test_fragment_to_document(self, fragment):
+        """Wrap an HTML fragment to make it look like a document.
+
+        Different parsers do this differently. For instance, lxml
+        introduces an empty <head> tag, and html5lib
+        doesn't. Abstracting this away lets us write simple tests
+        which run HTML fragments through the parser and compare the
+        results against other HTML fragments.
+
+        This method should not be used outside of tests.
+        """
+        return fragment
+
+    def set_up_substitutions(self, tag):
+        pass
+
+
+class SAXTreeBuilder(TreeBuilder):
+    """A Beautiful Soup treebuilder that listens for SAX events."""
+
+    def feed(self, markup):
+        raise NotImplementedError()
+
+    def close(self):
+        pass
+
+    def startElement(self, name, attrs):
+        attrs = dict((key[1], value) for key, value in attrs.items())
+        #print "Start %s, %r" % (name, attrs)
+        self.soup.handle_starttag(name, attrs)
+
+    def endElement(self, name):
+        #print "End %s" % name
+        self.soup.handle_endtag(name)
+
+    def startElementNS(self, nsTuple, nodeName, attrs):
+        # Throw away (ns, nodeName) for now.
+        self.startElement(nodeName, attrs)
+
+    def endElementNS(self, nsTuple, nodeName):
+        # Throw away (ns, nodeName) for now.
+        self.endElement(nodeName)
+        #handler.endElementNS((ns, node.nodeName), node.nodeName)
+
+    def startPrefixMapping(self, prefix, nodeValue):
+        # Ignore the prefix for now.
+        pass
+
+    def endPrefixMapping(self, prefix):
+        # Ignore the prefix for now.
+        # handler.endPrefixMapping(prefix)
+        pass
+
+    def characters(self, content):
+        self.soup.handle_data(content)
+
+    def startDocument(self):
+        pass
+
+    def endDocument(self):
+        pass
+
+
+class HTMLTreeBuilder(TreeBuilder):
+    """This TreeBuilder knows facts about HTML.
+
+    Such as which tags are empty-element tags.
+    """
+
+    preserve_whitespace_tags = set(['pre', 'textarea'])
+    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
+                              'spacer', 'link', 'frame', 'base'])
+
+    # Used by set_up_substitutions to detect the charset in a META tag.
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+    def set_up_substitutions(self, tag):
+        if tag.name != 'meta':
+            return False
+
+        http_equiv = tag.get('http-equiv')
+        content = tag.get('content')
+
+        if (http_equiv is not None
+            and content is not None
+            and http_equiv.lower() == 'content-type'):
+            # This is an interesting meta tag.
+            match = self.CHARSET_RE.search(content)
+            if match:
+                if (self.soup.declared_html_encoding is not None or
+                    self.soup.original_encoding == self.soup.from_encoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    tag['content'] = self.CHARSET_RE.sub(rewrite, content)
+                    return True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    new_charset = match.group(3)
+                    if (new_charset is not None
+                        and new_charset != self.soup.original_encoding):
+                        self.soup.declared_html_encoding = new_charset
+                        self.soup._feed(self.soup.declared_html_encoding)
+                        raise StopParsing
+                    pass
+        return False
+
+
+def register_treebuilders_from(module):
+    """Copy TreeBuilders from the given module into this module."""
+    # I'm fairly sure this is not the best way to do this.
+    this_module = sys.modules[__package__]
+    for name in module.__all__:
+        obj = getattr(module, name)
+
+        if issubclass(obj, TreeBuilder):
+            setattr(this_module, name, obj)
+            this_module.__all__.append(name)
+            # Register the builder while we're at it.
+            this_module.builder_registry.register(obj)
+
+# Builders are registered in reverse order of priority, so that custom
+# builder registrations will take precedence. In general, we want
+# html5lib to take precedence over lxml, because it's more
+# reliable. And we only want to use HTMLParser as a last resort.
+import _htmlparser
+register_treebuilders_from(_htmlparser)
+try:
+    import _lxml
+    register_treebuilders_from(_lxml)
+except ImportError:
+    # They don't have lxml installed.
+    pass
+try:
+    import _html5lib
+    register_treebuilders_from(_html5lib)
+except ImportError:
+    # They don't have html5lib installed.
+    pass
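To make the registry's lookup semantics concrete, here is a minimal sketch using two hypothetical builders (FastHTML and PermissiveHTML exist only for illustration; the real registrations happen in register_treebuilders_from above):

    from bs4.builder import TreeBuilder, TreeBuilderRegistry

    registry = TreeBuilderRegistry()

    class FastHTML(TreeBuilder):        # hypothetical, for illustration only
        features = ['fast', 'html']

    class PermissiveHTML(TreeBuilder):  # hypothetical, for illustration only
        features = ['permissive', 'html']

    registry.register(FastHTML)
    registry.register(PermissiveHTML)

    registry.lookup()              # PermissiveHTML: most recently registered
    registry.lookup('fast')        # FastHTML: the only builder with 'fast'
    registry.lookup('html')        # PermissiveHTML: ties go to the most recent
    registry.lookup('fast', 'permissive')  # None: no one builder has both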
+ """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) + + def feed(self, markup): + super(HTMLParserTreeBuilder, self).feed(markup) + + def handle_starttag(self, name, attrs): + self.soup.handle_starttag(name, dict(attrs)) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + diff --git a/bs4/dammit.py b/bs4/dammit.py new file mode 100644 index 0000000..75d445e --- /dev/null +++ b/bs4/dammit.py @@ -0,0 +1,412 @@ +"""Beautiful Soup bonus library: Unicode, Dammit + +This class forces XML data into a standard format (usually to UTF-8 or +Unicode). It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It does not rewrite the XML or HTML to reflect a new +encoding; that's the tree builder's job. +""" + +import codecs +from htmlentitydefs import codepoint2name +import re +import types + +# Autodetects character encodings. Very useful. +# Download from http://chardet.feedparser.org/ +# or 'apt-get install python-chardet' +# or 'easy_install chardet' +try: + import chardet + #import chardet.constants + #chardet.constants._debug = 1 +except ImportError: + chardet = None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters = [] + for codepoint, name in codepoint2name.items(): + if codepoint == 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. 
diff --git a/bs4/dammit.py b/bs4/dammit.py
new file mode 100644
index 0000000..75d445e
--- /dev/null
+++ b/bs4/dammit.py
@@ -0,0 +1,412 @@
+"""Beautiful Soup bonus library: Unicode, Dammit
+
+This class forces XML data into a standard format (usually to UTF-8 or
+Unicode). It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It does not rewrite the XML or HTML to reflect a new
+encoding; that's the tree builder's job.
+"""
+
+import codecs
+from htmlentitydefs import codepoint2name
+import re
+import types
+
+# Autodetects character encodings. Very useful.
+# Download from http://chardet.feedparser.org/
+# or 'apt-get install python-chardet'
+# or 'easy_install chardet'
+try:
+    import chardet
+    #import chardet.constants
+    #chardet.constants._debug = 1
+except ImportError:
+    chardet = None
+
+# Available from http://cjkpython.i18n.org/.
+try:
+    import iconv_codec
+except ImportError:
+    pass
+
+
+class EntitySubstitution(object):
+
+    """Substitute XML or HTML entities for the corresponding characters."""
+
+    def _populate_class_variables():
+        lookup = {}
+        reverse_lookup = {}
+        characters = []
+        for codepoint, name in codepoint2name.items():
+            if codepoint == 34:
+                # There's no point in turning the quotation mark into
+                # &quot;, unless it happens within an attribute value, which
+                # is handled elsewhere.
+                continue
+            character = unichr(codepoint)
+            characters.append(character)
+            lookup[character] = name
+            reverse_lookup[name] = character
+        re_definition = "[%s]" % "".join(characters)
+        return lookup, reverse_lookup, re.compile(re_definition)
+    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
+     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
+
+
+    CHARACTER_TO_XML_ENTITY = {
+        "'": "apos",
+        '"': "quot",
+        "&": "amp",
+        "<": "lt",
+        ">": "gt",
+        }
+
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           ")")
+
+    @classmethod
+    def _substitute_html_entity(cls, matchobj):
+        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+        return "&%s;" % entity
+
+    @classmethod
+    def _substitute_xml_entity(cls, matchobj):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+        return "&%s;" % entity
+
+    @classmethod
+    def substitute_xml(cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
+
+        :param value: A string to be substituted. The less-than sign will
+          become &lt;, the greater-than sign will become &gt;, and any
+          ampersands that are not part of an entity definition will
+          become &amp;.
+
+        :param make_quoted_attribute: If True, then the string will be
+          quoted, as befits an attribute value.
+
+          Ordinarily, the string will be quoted using double quotes.
+
+            Bob's Bar -> "Bob's Bar"
+
+          If the string contains double quotes, it will be quoted using
+          single quotes.
+
+            Welcome to "my bar" -> 'Welcome to "my bar"'
+
+          If the string contains both single and double quotes, the
+          double quotes will be escaped, and the string will be quoted
+          using double quotes.
+
+            Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
+        """
+        if make_quoted_attribute:
+            quote_with = '"'
+            if '"' in value:
+                if "'" in value:
+                    # The string contains both single and double
+                    # quotes. Turn the double quotes into
+                    # entities. We quote the double quotes rather than
+                    # the single quotes because the entity name is
+                    # "&quot;" whether this is HTML or XML. If we
+                    # quoted the single quotes, we'd have to decide
+                    # between &apos; and &squot;.
+                    replace_with = "&quot;"
+                    value = value.replace('"', replace_with)
+                else:
+                    # There are double quotes but no single quotes.
+                    # We can use single quotes to quote the attribute.
+                    quote_with = "'"
+
+        # Escape angle brackets, and ampersands that aren't part of
+        # entities.
+        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+            cls._substitute_xml_entity, value)
+        if make_quoted_attribute:
+            return quote_with + value + quote_with
+        else:
+            return value
+
+    @classmethod
+    def substitute_html(cls, s):
+        """Replace certain Unicode characters with named HTML entities.
+
+        This differs from data.encode(encoding, 'xmlcharrefreplace')
+        in that the goal is to make the result more readable (to those
+        with ASCII displays) rather than to recover from
+        errors. There's absolutely nothing wrong with a UTF-8 string
+        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
+        character with "&eacute;" will make it more readable to some
+        people.
+        """
+        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+            cls._substitute_html_entity, s)
+
+
+class UnicodeDammit:
+    """A class for detecting the encoding of a *ML document and
+    converting it to a Unicode string. If the source encoding is
+    windows-1252, can replace MS smart quotes with their HTML or XML
+    equivalents."""
+
+    # This dictionary maps commonly seen values for "charset" in HTML
+    # meta tags to the corresponding Python codec names. It only covers
+    # values that aren't in Python's aliases and can't be determined
+    # by the heuristics in find_codec.
+    CHARSET_ALIASES = {"macintosh": "mac-roman",
+                       "x-sjis": "shift-jis"}
+
+    ENCODINGS_WITH_SMART_QUOTES = [
+        "windows-1252",
+        "iso-8859-1",
+        "iso-8859-2",
+        ]
+
+    def __init__(self, markup, override_encodings=[],
+                 smart_quotes_to=None, isHTML=False):
+        self.declared_html_encoding = None
+        self.markup, document_encoding, sniffed_encoding = \
+                     self._detectEncoding(markup, isHTML)
+        self.smart_quotes_to = smart_quotes_to
+        self.tried_encodings = []
+        if markup == '' or isinstance(markup, unicode):
+            self.original_encoding = None
+            self.unicode = unicode(markup)
+            return
+
+        u = None
+        for proposed_encoding in (
+            override_encodings + [document_encoding, sniffed_encoding]):
+            if proposed_encoding is not None:
+                u = self._convert_from(proposed_encoding)
+                if u:
+                    break
+
+        # If no luck and we have an auto-detection library, try that:
+        if not u and chardet and not isinstance(self.markup, unicode):
+            u = self._convert_from(chardet.detect(self.markup)['encoding'])
+
+        # As a last resort, try utf-8 and windows-1252:
+        if not u:
+            for proposed_encoding in ("utf-8", "windows-1252"):
+                u = self._convert_from(proposed_encoding)
+                if u:
+                    break
+
+        self.unicode = u
+        if not u: self.original_encoding = None
+
+    def _sub_ms_char(self, match):
+        """Changes a MS smart quote character to an XML or HTML
+        entity."""
+        orig = match.group(1)
+        sub = self.MS_CHARS.get(orig)
+        if type(sub) == types.TupleType:
+            if self.smart_quotes_to == 'xml':
+                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+            else:
+                sub = '&'.encode() + sub[0].encode() + ';'.encode()
+        else:
+            sub = sub.encode()
+        return sub
+
+    def _convert_from(self, proposed):
+        proposed = self.find_codec(proposed)
+        if not proposed or proposed in self.tried_encodings:
+            return None
+        self.tried_encodings.append(proposed)
+        markup = self.markup
+
+        # Convert smart quotes to HTML if coming from an encoding
+        # that might have them.
+        if (self.smart_quotes_to is not None
+            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+            smart_quotes_re = "([\x80-\x9f])"
+            smart_quotes_compiled = re.compile(smart_quotes_re)
+            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
+
+        try:
+            # print "Trying to convert document to %s" % proposed
+            u = self._to_unicode(markup, proposed)
+            self.markup = u
+            self.original_encoding = proposed
+        except Exception, e:
+            # print "That didn't work!"
+            # print e
+            return None
+        #print "Correct encoding: %s" % proposed
+        return self.markup
+
+    def _to_unicode(self, data, encoding):
+        '''Given a string and its encoding, decodes the string into Unicode.
+        %encoding is a string recognized by encodings.aliases'''
+
+        # strip Byte Order Mark (if present)
+        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == '\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == '\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == '\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        newdata = unicode(data, encoding)
+        return newdata
+
+    def _detectEncoding(self, xml_data, isHTML=False):
+        """Given a document, tries to detect its XML encoding."""
+        xml_encoding = sniffed_xml_encoding = None
+        try:
+            if xml_data[:4] == '\x4c\x6f\xa7\x94':
+                # EBCDIC
+                xml_data = self._ebcdic_to_ascii(xml_data)
+            elif xml_data[:4] == '\x00\x3c\x00\x3f':
+                # UTF-16BE
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
+                     and (xml_data[2:4] != '\x00\x00'):
+                # UTF-16BE with BOM
+                sniffed_xml_encoding = 'utf-16be'
+                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x3f\x00':
+                # UTF-16LE
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
+                     (xml_data[2:4] != '\x00\x00'):
+                # UTF-16LE with BOM
+                sniffed_xml_encoding = 'utf-16le'
+                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\x00\x3c':
+                # UTF-32BE
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\x3c\x00\x00\x00':
+                # UTF-32LE
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+            elif xml_data[:4] == '\x00\x00\xfe\xff':
+                # UTF-32BE with BOM
+                sniffed_xml_encoding = 'utf-32be'
+                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+            elif xml_data[:4] == '\xff\xfe\x00\x00':
+                # UTF-32LE with BOM
+                sniffed_xml_encoding = 'utf-32le'
+                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+            elif xml_data[:3] == '\xef\xbb\xbf':
+                # UTF-8 with BOM
+                sniffed_xml_encoding = 'utf-8'
+                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+            else:
+                sniffed_xml_encoding = 'ascii'
+                pass
+        except:
+            xml_encoding_match = None
+        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+        if not xml_encoding_match and isHTML:
+            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+            regexp = re.compile(meta_re, re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].decode(
+                'ascii').lower()
+            if isHTML:
+                self.declared_html_encoding = xml_encoding
+            if sniffed_xml_encoding and \
+               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
+                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
+                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
+                                 'utf16', 'u16')):
+                xml_encoding = sniffed_xml_encoding
+        return xml_data, xml_encoding, sniffed_xml_encoding
+
+
+    def find_codec(self, charset):
+        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
+               or (charset and self._codec(charset.replace("-", ""))) \
+               or (charset and self._codec(charset.replace("-", "_"))) \
+               or charset
+
+    def _codec(self, charset):
+        if not charset: return charset
+        codec = None
+        try:
+            codecs.lookup(charset)
+            codec = charset
+        except (LookupError, ValueError):
+            pass
+        return codec
+
+    EBCDIC_TO_ASCII_MAP = None
+    def _ebcdic_to_ascii(self, s):
+        c = self.__class__
+        if not c.EBCDIC_TO_ASCII_MAP:
+            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
+                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
+                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
+                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
+                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
+                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
+                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
+                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
+                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
+                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
+                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
+                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
+                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
+                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
+                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
+                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
+                    250,251,252,253,254,255)
+            import string
+            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
+                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+        return s.translate(c.EBCDIC_TO_ASCII_MAP)
+
+    MS_CHARS = {'\x80': ('euro', '20AC'),
+                '\x81': ' ',
+                '\x82': ('sbquo', '201A'),
+                '\x83': ('fnof', '192'),
+                '\x84': ('bdquo', '201E'),
+                '\x85': ('hellip', '2026'),
+                '\x86': ('dagger', '2020'),
+                '\x87': ('Dagger', '2021'),
+                '\x88': ('circ', '2C6'),
+                '\x89': ('permil', '2030'),
+                '\x8A': ('Scaron', '160'),
+                '\x8B': ('lsaquo', '2039'),
+                '\x8C': ('OElig', '152'),
+                '\x8D': '?',
+                '\x8E': ('#x17D', '17D'),
+                '\x8F': '?',
+                '\x90': '?',
+                '\x91': ('lsquo', '2018'),
+                '\x92': ('rsquo', '2019'),
+                '\x93': ('ldquo', '201C'),
+                '\x94': ('rdquo', '201D'),
+                '\x95': ('bull', '2022'),
+                '\x96': ('ndash', '2013'),
+                '\x97': ('mdash', '2014'),
+                '\x98': ('tilde', '2DC'),
+                '\x99': ('trade', '2122'),
+                '\x9a': ('scaron', '161'),
+                '\x9b': ('rsaquo', '203A'),
+                '\x9c': ('oelig', '153'),
+                '\x9d': '?',
+                '\x9e': ('#x17E', '17E'),
+                '\x9f': ('Yuml', ''),}
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
index ee5b2da..655cd06 100644
--- a/tests/test_builder_registry.py
+++ b/tests/test_builder_registry.py
@@ -8,7 +8,8 @@ from bs4.builder import (
     LXMLTreeBuilderForXML,
     LXMLTreeBuilder,
     TreeBuilderRegistry,
-    HTML5TreeBuilder
+    HTML5TreeBuilder,
+    HTMLParserTreeBuilder,
     )
 
@@ -21,6 +22,8 @@ class BuiltInRegistryTest(unittest.TestCase):
                       LXMLTreeBuilder)
         self.assertEquals(registry.lookup('permissive', 'xml'),
                       LXMLTreeBuilderForXML)
+        self.assertEquals(registry.lookup('strict', 'html'),
+                      HTMLParserTreeBuilder)
         self.assertEquals(registry.lookup('permissive', 'html'),
                       HTML5TreeBuilder)
 
@@ -36,6 +39,9 @@ class BuiltInRegistryTest(unittest.TestCase):
         self.assertEquals(registry.lookup('html5lib'),
                           HTML5TreeBuilder)
 
+        self.assertEquals(registry.lookup('html.parser'),
+                          HTMLParserTreeBuilder)
+
     def test_unimplemented_combinations(self):
         self.assertEquals(registry.lookup('fast', 'permissive', 'html'),
                           None)
diff --git a/tests/test_htmlparser.py b/tests/test_htmlparser.py
new file mode 100644
index 0000000..c8a446e
--- /dev/null
+++ b/tests/test_htmlparser.py
@@ -0,0 +1,126 @@
+from HTMLParser import HTMLParseError
+from bs4.builder import HTMLParserTreeBuilder
+from bs4.element import CData
+from test_lxml import (
+    TestLXMLBuilder,
+    TestLXMLBuilderEncodingConversion,
+    TestLXMLBuilderInvalidMarkup,
+    )
+
+class TestHTMLParserTreeBuilder(TestLXMLBuilder):
+    """See `BuilderSmokeTest`."""
+
+    @property
+    def default_builder(self):
+        return HTMLParserTreeBuilder()
+
+    def test_bare_string(self):
+        # A bare string is turned into some kind of HTML document or
+        # fragment recognizable as the original string.
+        #
+        # HTMLParser does not modify the bare string at all.
+        self.assertSoupEquals("A bare string")
+
+    def test_cdata_where_its_ok(self):
+        # HTMLParser recognizes CDATA sections and passes them through.
+        markup = "<svg><![CDATA[foobar]]></svg>"
+        self.assertSoupEquals(markup)
+        soup = self.soup(markup)
+        string = soup.svg.string
+        self.assertEquals(string, "foobar")
+        self.assertTrue(isinstance(string, CData))
+
+    # These are tests that could be 'fixed' by improving the
+    # HTMLParserTreeBuilder, but I don't think it's worth it. Users
+    # will have fewer headaches if they use one of the other tree
+    # builders.
+
+    def test_empty_element(self):
+        # HTML's empty-element tags are not recognized as such
+        # unless they are presented as empty-element tags.
+        self.assertSoupEquals(
+            "<p>A <meta> tag</p>", "<p>A <meta> tag</meta></p>")
+
+        self.assertSoupEquals(
+            "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+
+    def test_entities_in_attribute_values_converted_during_parsing(self):
+
+        # The numeric entity isn't recognized without the closing
+        # semicolon.
+        text = '<x t="pi&#241ata">'
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], "pi&#241ata")
+
+        text = '<x t="pi&#241;ata">'
+        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], u"pi\xf1ata")
+
+        text = '<x t="pi&ntilde;ata">'
+        soup = self.soup(text)
+        self.assertEquals(soup.x['t'], expected)
+
+        text = '<x t="sacr&eacute; bleu">'
+        soup = self.soup(text)
+        self.assertEquals(
+            soup.x['t'],
+            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
+
+        # This can cause valid HTML to become invalid.
+        valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
+        soup = self.soup(valid_url)
+        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
+
+    # I think it would be very difficult to 'fix' these tests, judging
+    # from my experience with previous versions of Beautiful Soup.
+    def test_naked_ampersands(self):
+        # Ampersands are treated as entities.
+        text = "<p>AT&T</p>"
+        soup = self.soup(text)
+        self.assertEquals(soup.p.string, "AT&T;")
+
+    def test_literal_in_textarea(self):
+        # Anything inside a <textarea> is supposed to be treated as
+        # the literal value of the field (XXX citation
+        # needed). html5lib does this correctly. But HTMLParser does its
+        # best to parse the contents of a <textarea> as HTML.
+        text = '<textarea>Junk like <b> tags and <&<&</textarea>'
+        soup = self.soup(text)
+        self.assertEquals(len(soup.textarea.contents), 2)
+        self.assertEquals(soup.textarea.contents[0], u"Junk like ")
+        self.assertEquals(soup.textarea.contents[1].name, 'b')
+        self.assertEquals(soup.textarea.b.string, u" tags and <&<&")
+
+    def test_literal_in_script(self):
+        # The contents of a <script> tag are supposed to be treated as
+        # a literal string, even if that string contains HTML. But
+        # HTMLParser attempts to parse some of the HTML, causing much
+        # pain.
+        javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
+        soup = self.soup('<script>%s</script>' % javascript)
+        self.assertEquals(soup.script.contents,
+                          ['if (i < 2) { alert("<b>foo',
+                           '"); }'])
+
+    # Namespaced doctypes cause an HTMLParseError
+    def test_namespaced_system_doctype(self):
+        self.assertRaises(HTMLParseError, self._test_doctype,
+                          'xsl:stylesheet SYSTEM "htmlent.dtd"')
+
+    def test_namespaced_public_doctype(self):
+        self.assertRaises(HTMLParseError, self._test_doctype,
+                          'xsl:stylesheet PUBLIC "htmlent.dtd"')
+
+
+class TestHTMLParserTreeBuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
+    # Oddly enough, HTMLParser seems to handle invalid markup exactly
+    # the same as lxml.
+    pass
+
+
+class TestHTMLParserTreeBuilderEncodingConversion(
+    TestLXMLBuilderEncodingConversion):
+    # Re-run the lxml tests for HTMLParser.
+    pass
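The script-literal failure pinned down in test_literal_in_script is easy to reproduce by hand; as before, this sketch assumes a builder keyword on the BeautifulSoup constructor:

    from bs4 import BeautifulSoup
    from bs4.builder import HTMLParserTreeBuilder

    js = 'if (i < 2) { alert("<b>foo</b>"); }'
    soup = BeautifulSoup('<script>%s</script>' % js,
                         builder=HTMLParserTreeBuilder())
    # HTMLParser treats the <b> inside the script as real markup, so
    # the script text comes back split around a phantom tag instead
    # of as one literal string.
    print soup.script.contents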
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 0eec688..7e83eff 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -168,27 +168,6 @@ class TestLXMLBuilder(SoupTest):
         expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
         self.assertSoupEquals(text, expected)
 
-    def test_entities_in_attribute_values_converted_during_parsing(self):
-        text = '<x t="pi&#241;ata">'
-        expected = u"pi\N{LATIN SMALL LETTER N WITH TILDE}ata"
-        soup = self.soup(text)
-        self.assertEquals(soup.x['t'], expected)
-
-        text = '<x t="pi&ntilde;ata">'
-        soup = self.soup(text)
-        self.assertEquals(soup.x['t'], expected)
-
-        text = '<x t="sacr&eacute; bleu">'
-        soup = self.soup(text)
-        self.assertEquals(
-            soup.x['t'],
-            u"sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu")
-
-        # This can cause valid HTML to become invalid.
-        valid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
-        soup = self.soup(valid_url)
-        self.assertEquals(soup.a['href'], "http://example.org?a=1&b=2;3")
-
     def test_smart_quotes_converted_on_the_way_in(self):
         # Microsoft smart quotes are converted to Unicode characters during
         # parsing.
@@ -230,7 +209,7 @@ class TestLXMLBuilder(SoupTest):
         # Test a namespaced doctype with a system id.
         self._test_doctype('xsl:stylesheet SYSTEM "htmlent.dtd"')
 
-    def test_namespaced_system_doctype(self):
+    def test_namespaced_public_doctype(self):
         # Test a namespaced doctype with a public id.
         self._test_doctype('xsl:stylesheet PUBLIC "htmlent.dtd"')
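The tests moved out of test_lxml.py cover entity handling on the way in; the way out is handled by EntitySubstitution, defined in bs4/dammit.py above:

    from bs4.dammit import EntitySubstitution

    EntitySubstitution.substitute_html(u'Sacr\xe9 bleu!')
    # -> u'Sacr&eacute; bleu!'

    # Attribute quoting switches to single quotes when the value
    # contains double quotes; bare ampersands are escaped but
    # existing entities are left alone.
    EntitySubstitution.substitute_xml('Welcome to "my bar"',
                                      make_quoted_attribute=True)
    # -> '\'Welcome to "my bar"\''
    EntitySubstitution.substitute_xml('AT&T &amp; friends',
                                      make_quoted_attribute=True)
    # -> '"AT&amp;T &amp; friends"'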