Merge in HEAD.

author: Leonard Richardson <leonardr@segfault.org> 2015-06-25 06:34:26 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2015-06-25 06:34:26 -0400
commit: 569c50e1744543323e31d3e24aa1c43f0555dc43 (patch)
tree: 52f226b9925970fc254e20a9a9414fc85004a636
parent: 6c03df57433b5dc1faa3f31f35f5f3fcd68c68c1 (diff)
parent: 0c721728089eae05a0cab8ca04307dc859587470 (diff)
4 files changed, 40 insertions, 7 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 92765e1..e3c5938 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -40,6 +40,13 @@
   displayed correctly even if the filename or URL is a Unicode
   string. [bug=1268888]
 
+* If the initial <html> tag contains a CDATA list attribute such as
+  'class', the html5lib tree builder will now turn its value into a
+  list, as it would with any other tag. [bug=1296481]
+
+* Fixed an import error in Python 3.5 caused by the removal of the
+  HTMLParseError class. [bug=1420063]
+
 * Improved docstring for encode_contents() and
   decode_contents(). [bug=1441543]
 
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ea8ff43..ad3c6ef 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -9,7 +9,10 @@ from bs4.builder import (
     HTML_5,
     HTMLTreeBuilder,
     )
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+    NamespacedAttribute,
+    whitespace_re,
+)
 import html5lib
 from html5lib.constants import namespaces
 from bs4.element import (
@@ -103,7 +106,13 @@ class AttrList(object):
     def __iter__(self):
         return list(self.attrs.items()).__iter__()
     def __setitem__(self, name, value):
-        "set attr", name, value
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            value = whitespace_re.split(value)
         self.element[name] = value
     def items(self):
         return list(self.attrs.items())
@@ -180,6 +189,7 @@ class Element(html5lib.treebuilders._base.Node):
         return AttrList(self.element)
 
     def setAttributes(self, attributes):
+
         if attributes is not None and len(attributes) > 0:
 
             converted_attributes = []
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 7f3ae73..b2cd467 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from HTMLParser import HTMLParser
+
+try:
+    from HTMLParser import HTMLParseError
+except ImportError, e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
+
 import sys
 import warnings
 
@@ -20,8 +26,10 @@ import warnings
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
 CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
 CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
 
+
 from bs4.element import (
     CData,
     Comment,
@@ -123,7 +131,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     features = [NAME, HTML, STRICT]
 
     def __init__(self, *args, **kwargs):
-        if CONSTRUCTOR_TAKES_STRICT:
+        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
             kwargs['strict'] = False
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
             kwargs['convert_charrefs'] = False
diff --git a/bs4/testing.py b/bs4/testing.py
index dfaa047..8ca3878 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -243,6 +243,14 @@ Hello, world!
         soup = self.soup(markup)
         self.assertEqual(["css"], soup.div.div['class'])
 
+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes of the
+        # <html> tag. This has caused problems with multivalued
+        # attributes.
+        markup = '<html class="a b"></html>'
+        soup = self.soup(markup)
+        self.assertEqual(["a", "b"], soup.html['class'])
+
     def test_angle_brackets_in_attribute_values_are_escaped(self):
         self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
author	Leonard Richardson <leonardr@segfault.org>	2015-06-25 06:34:26 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2015-06-25 06:34:26 -0400
commit	569c50e1744543323e31d3e24aa1c43f0555dc43 (patch)
tree	52f226b9925970fc254e20a9a9414fc85004a636
parent	6c03df57433b5dc1faa3f31f35f5f3fcd68c68c1 (diff)
parent	0c721728089eae05a0cab8ca04307dc859587470 (diff)