diff options
-rw-r--r-- | NEWS.txt | 3 |
-rw-r--r-- | bs4/builder/__init__.py | 20 |
-rw-r--r-- | bs4/builder/_html5lib.py | 15 |
-rw-r--r-- | bs4/element.py | 16 |
-rw-r--r-- | bs4/testing.py | 5 |
5 files changed, 43 insertions, 16 deletions
@@ -7,6 +7,9 @@ definitions ending with two question marks instead of one. [bug=984258] +* The test suite now passes when lxml is not installed, whether or not + html5lib is installed. [bug=987004] + * Print a warning on HTMLParseErrors to let people know they should install a better parser library. diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 9f4f59e..4c22b86 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -1,8 +1,10 @@ from collections import defaultdict +import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + whitespace_re ) __all__ = [ @@ -140,6 +142,24 @@ class TreeBuilder(object): def set_up_substitutions(self, tag): return False + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), []) + for cdata_list_attr in itertools.chain(universal, tag_specific): + if cdata_list_attr in dict(attrs): + # Basically, we have a "class" attribute whose + # value is a whitespace-separated list of CSS + # classes. Split it into a list. 
+ value = attrs[cdata_list_attr] + values = whitespace_re.split(value) + attrs[cdata_list_attr] = values + return attrs class SAXTreeBuilder(TreeBuilder): """A Beautiful Soup treebuilder that listens for SAX events.""" diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 2d6fe31..6001e38 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -139,11 +139,20 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): - if attributes is not None and attributes != {}: + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] for name, value in list(attributes.items()): if isinstance(name, tuple): - name = NamespacedAttribute(*name) - self.element[name] = value + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. # diff --git a/bs4/element.py b/bs4/element.py index aa9a3e9..282193e 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,5 +1,4 @@ import collections -import itertools import re import sys import warnings @@ -735,20 +734,11 @@ class Tag(PageElement): self.prefix = prefix if attrs is None: attrs = {} + elif builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) else: attrs = dict(attrs) - if builder.cdata_list_attributes: - universal = builder.cdata_list_attributes.get('*', []) - tag_specific = builder.cdata_list_attributes.get( - self.name.lower(), []) - for cdata_list_attr in itertools.chain(universal, tag_specific): - if cdata_list_attr in attrs: - # Basically, we have a "class" attribute whose - # value is a whitespace-separated list of CSS - # classes. Split it into a list. 
- value = attrs[cdata_list_attr] - values = whitespace_re.split(value) - attrs[cdata_list_attr] = values self.attrs = attrs self.contents = [] self.setup(parent, previous) diff --git a/bs4/testing.py b/bs4/testing.py index 94c87c9..b004c18 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -238,6 +238,11 @@ class HTMLTreeBuilderSmokeTest(object): self.assertEqual( 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + def test_multivalued_attribute_value_becomes_list(self): + markup = b'<a class="foo bar">' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are |