summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2015-06-25 06:34:26 -0400
committerLeonard Richardson <leonardr@segfault.org>2015-06-25 06:34:26 -0400
commit569c50e1744543323e31d3e24aa1c43f0555dc43 (patch)
tree52f226b9925970fc254e20a9a9414fc85004a636
parent6c03df57433b5dc1faa3f31f35f5f3fcd68c68c1 (diff)
parent0c721728089eae05a0cab8ca04307dc859587470 (diff)
Merge in HEAD.
-rw-r--r--NEWS.txt7
-rw-r--r--bs4/builder/_html5lib.py14
-rw-r--r--bs4/builder/_htmlparser.py18
-rw-r--r--bs4/testing.py8
4 files changed, 40 insertions, 7 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 92765e1..e3c5938 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -40,6 +40,13 @@
displayed correctly even if the filename or URL is a Unicode
string. [bug=1268888]
+* If the initial <html> tag contains a CDATA list attribute such as
+ 'class', the html5lib tree builder will now turn its value into a
+ list, as it would with any other tag. [bug=1296481]
+
+* Fixed an import error in Python 3.5 caused by the removal of the
+ HTMLParseError class. [bug=1420063]
+
* Improved docstring for encode_contents() and
decode_contents(). [bug=1441543]
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index ea8ff43..ad3c6ef 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -9,7 +9,10 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+ NamespacedAttribute,
+ whitespace_re,
+)
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
@@ -103,7 +106,13 @@ class AttrList(object):
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
- "set attr", name, value
+ # If this attribute is a multi-valued attribute for this element,
+ # turn its value into a list.
+ list_attr = HTML5TreeBuilder.cdata_list_attributes
+ if (name in list_attr['*']
+ or (self.element.name in list_attr
+ and name in list_attr[self.element.name])):
+ value = whitespace_re.split(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -180,6 +189,7 @@ class Element(html5lib.treebuilders._base.Node):
return AttrList(self.element)
def setAttributes(self, attributes):
+
if attributes is not None and len(attributes) > 0:
converted_attributes = []
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 7f3ae73..b2cd467 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -4,10 +4,16 @@ __all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import (
- HTMLParser,
- HTMLParseError,
- )
+from HTMLParser import HTMLParser
+
+try:
+ from HTMLParser import HTMLParseError
+except ImportError, e:
+ # HTMLParseError is removed in Python 3.5. Since it can never be
+ # thrown in 3.5, we can just define our own class as a placeholder.
+ class HTMLParseError(Exception):
+ pass
+
import sys
import warnings
@@ -20,8 +26,10 @@ import warnings
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+
from bs4.element import (
CData,
Comment,
@@ -123,7 +131,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT:
+ if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
diff --git a/bs4/testing.py b/bs4/testing.py
index dfaa047..8ca3878 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -243,6 +243,14 @@ Hello, world!
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
+ def test_multivalued_attribute_on_html(self):
+ # html5lib uses a different API to set the attributes of the
+ # <html> tag. This has caused problems with multivalued
+ # attributes.
+ markup = '<html class="a b"></html>'
+ soup = self.soup(markup)
+ self.assertEqual(["a", "b"], soup.html['class'])
+
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')