diff options
-rw-r--r-- | NEWS.txt | 7 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 14 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 18 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
4 files changed, 40 insertions, 7 deletions
@@ -40,6 +40,13 @@ displayed correctly even if the filename or URL is a Unicode string. [bug=1268888] +* If the initial <html> tag contains a CDATA list attribute such as + 'class', the html5lib tree builder will now turn its value into a + list, as it would with any other tag. [bug=1296481] + +* Fixed an import error in Python 3.5 caused by the removal of the + HTMLParseError class. [bug=1420063] + * Improved docstring for encode_contents() and decode_contents(). [bug=1441543] diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index ea8ff43..ad3c6ef 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -9,7 +9,10 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -103,7 +106,13 @@ class AttrList(object): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. 
+ list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -180,6 +189,7 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 7f3ae73..b2cd467 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -4,10 +4,16 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -20,8 +26,10 @@ import warnings # strict=True works well on Python 3.2.2. major, minor, release = sys.version_info[:3] CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, Comment, @@ -123,7 +131,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: kwargs['convert_charrefs'] = False diff --git a/bs4/testing.py b/bs4/testing.py index dfaa047..8ca3878 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -243,6 +243,14 @@ Hello, world! 
soup = self.soup(markup) self.assertEqual(["css"], soup.div.div['class']) + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes of the + # <html> tag. This has caused problems with multivalued + # attributes. + markup = '<html class="a b"></html>' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') |