diff options
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 11 | ||||
-rw-r--r-- | bs4/testing.py | 8 |
3 files changed, 24 insertions, 1 deletions
@@ -1,3 +1,9 @@
+= 4.1.1 (Unreleased) =
+
+Fixed an html5lib tree builder crash which happened when html5lib
+moved a tag with a multivalued attribute from one part of the tree to
+another. [bug=1019603]
+
 = 4.1.0 (20120529) =
 
 * Added experimental support for fixing Windows-1252 characters
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4c22b86..dc7deb9 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -157,7 +157,16 @@ class TreeBuilder(object):
                     # value is a whitespace-separated list of CSS
                     # classes. Split it into a list.
                     value = attrs[cdata_list_attr]
-                    values = whitespace_re.split(value)
+                    if isinstance(value, basestring):
+                        values = whitespace_re.split(value)
+                    else:
+                        # html5lib sometimes calls setAttributes twice
+                        # for the same tag when rearranging the parse
+                        # tree. On the second call the attribute value
+                        # here is already a list. If this happens,
+                        # leave the value alone rather than trying to
+                        # split it again.
+                        values = value
                     attrs[cdata_list_attr] = values
         return attrs
 
diff --git a/bs4/testing.py b/bs4/testing.py
index 5a84b0b..51f7e22 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -202,6 +202,14 @@ class HTMLTreeBuilderSmokeTest(object):
             "<tbody><tr><td>Bar</td></tr></tbody>"
             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
 
+    def test_deeply_nested_multivalued_attribute(self):
+        # html5lib can set the attributes of the same tag many times
+        # as it rearranges the tree. This has caused problems with
+        # multivalued attributes.
+        markup = '<table><div><div class="css"></div></div></table>'
+        soup = self.soup(markup)
+        self.assertEqual(["css"], soup.div.div['class'])
+
     def test_angle_brackets_in_attribute_values_are_escaped(self):
         self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
 