summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2012-06-30 10:43:47 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2012-06-30 10:43:47 -0400
commit: 7a47c96f352aafcae4ad280e4d3d8456b53d7ffe (patch)
tree: 15413efa4810c902b4d3cc5222e905fd6abc8627
parent: 093ec128d5732b02e75df2566c7db2c6e381d766 (diff)
Fixed an html5lib tree builder crash which happened when html5lib
moved a tag with a multivalued attribute from one part of the tree to another. [bug=1019603]
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/builder/__init__.py 11
-rw-r--r-- bs4/testing.py 8
3 files changed, 24 insertions, 1 deletion
diff --git a/NEWS.txt b/NEWS.txt
index 0443968..2ca93ce 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,9 @@
+= 4.1.1 (Unreleased) =
+
+Fixed an html5lib tree builder crash which happened when html5lib
+moved a tag with a multivalued attribute from one part of the tree to
+another. [bug=1019603]
+
= 4.1.0 (20120529) =
* Added experimental support for fixing Windows-1252 characters
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4c22b86..dc7deb9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -157,7 +157,16 @@ class TreeBuilder(object):
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
- values = whitespace_re.split(value)
+ if isinstance(value, basestring):
+ values = whitespace_re.split(value)
+ else:
+ # html5lib sometimes calls setAttributes twice
+ # for the same tag when rearranging the parse
+ # tree. On the second call the attribute value
+ # here is already a list. If this happens,
+ # leave the value alone rather than trying to
+ # split it again.
+ values = value
attrs[cdata_list_attr] = values
return attrs
diff --git a/bs4/testing.py b/bs4/testing.py
index 5a84b0b..51f7e22 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -202,6 +202,14 @@ class HTMLTreeBuilderSmokeTest(object):
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+ def test_deeply_nested_multivalued_attribute(self):
+ # html5lib can set the attributes of the same tag many times
+ # as it rearranges the tree. This has caused problems with
+ # multivalued attributes.
+ markup = '<table><div><div class="css"></div></div></table>'
+ soup = self.soup(markup)
+ self.assertEqual(["css"], soup.div.div['class'])
+
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')