diff options
-rw-r--r-- | CHANGELOG | 3 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 14 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 11 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 3 | ||||
-rw-r--r-- | bs4/element.py | 23 | ||||
-rw-r--r-- | bs4/testing.py | 8 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 4 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 4 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 29 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 |
12 files changed, 77 insertions, 28 deletions
@@ -3,7 +3,8 @@ * It's now possible to customize the TreeBuilder object by passing keyword arguments into the BeautifulSoup constructor. The main reason to do this right now is to change how multi-valued - attributes are treated. [bug=1832978] + attributes are treated -- you can do this with the + 'cdata_list_attributes' argument. [bug=1832978] = 4.7.1 (20190106) diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 4207750..9dad920 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -96,11 +96,15 @@ class TreeBuilder(object): # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - cdata_list_attributes = {} + DEFAULT_CDATA_LIST_ATTRIBUTES = {} - - def __init__(self): + USE_DEFAULT = object() + + def __init__(self, cdata_list_attributes=USE_DEFAULT): self.soup = None + if cdata_list_attributes is self.USE_DEFAULT: + cdata_list_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = cdata_list_attributes def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now @@ -131,7 +135,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -259,7 +263,7 @@ class HTMLTreeBuilder(TreeBuilder): # encounter one of these attributes, we will parse its value into # a list of values if possible. Upon output, the list will be # converted back into a string. - cdata_list_attributes = { + DEFAULT_CDATA_LIST_ATTRIBUTES = { "*" : ['class', 'accesskey', 'dropzone'], "a" : ['rel', 'rev'], "link" : ['rel', 'rev'], diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 6fa8593..6892a93 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -199,7 +199,7 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes + list_attr = self.element.cdata_list_attributes if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ff09ca3..56b8b91 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -214,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False + parser_kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index b7e172c..27cadcb 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -94,7 +94,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser - def __init__(self, parser=None, empty_element_tags=None): + def __init__(self, parser=None, empty_element_tags=None, **kwargs): # TODO: Issue a warning if parser is present but not a # callable, since that means there's no way to create new # parsers for different encodings. @@ -103,6 +103,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.empty_element_tags = set(empty_element_tags) self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag diff --git a/bs4/element.py b/bs4/element.py index 547b8ba..1183f77 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -861,12 +861,27 @@ class Tag(PageElement): self.setup(parent, previous) self.hidden = False - # Set up any substitutions, such as the charset in a META tag. - if builder is not None: + if builder is None: + # In the absence of a TreeBuilder, assume this tag is nothing + # special. + self.can_be_empty_element = False + self.cdata_list_attributes = None + else: + # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. self.can_be_empty_element = builder.can_be_empty_element(name) - else: - self.can_be_empty_element = False + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes parserClass = _alias("parser_class") # BS3 diff --git a/bs4/testing.py b/bs4/testing.py index e4a0ffe..e144e7e 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -63,19 +63,19 @@ class SoupTest(unittest.TestCase): @property def default_builder(self): - return default_builder() + return default_builder def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return BeautifulSoup(markup, builder=builder, **kwargs) - def document_for(self, markup): + def document_for(self, markup, **kwargs): """Turn an HTML fragment into a document. The details depend on the builder. """ - return self.default_builder.test_fragment_to_document(markup) + return self.default_builder(**kwargs).test_fragment_to_document(markup) def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder @@ -232,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object): soup = self.soup("") new_tag = soup.new_tag(name) self.assertEqual(True, new_tag.is_empty_element) - + def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 3a04787..371463a 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): @property def default_builder(self): - return HTML5TreeBuilder() + return HTML5TreeBuilder def test_soupstrainer(self): # The html5lib tree builder does not support SoupStrainers. diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index 0381c7d..790489a 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -9,9 +9,7 @@ from bs4.builder._htmlparser import BeautifulSoupHTMLParser class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - @property - def default_builder(self): - return HTMLParserTreeBuilder() + default_builder = HTMLParserTreeBuilder def test_namespaced_system_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 1a4f27c..3b7858f 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): @property def default_builder(self): - return LXMLTreeBuilder() + return LXMLTreeBuilder def test_out_of_range_entity(self): self.assertSoupEquals( @@ -79,7 +79,7 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): @property def default_builder(self): - return LXMLTreeBuilderForXML() + return LXMLTreeBuilderForXML def test_namespace_indexing(self): # We should not track un-prefixed namespaces as we can only hold one diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 1c6b7a6..213255d 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -24,6 +24,7 @@ from bs4.dammit import ( EncodingDetector, ) from bs4.testing import ( + default_builder, SoupTest, skipIf, ) @@ -89,7 +90,33 @@ class TestConstructor(SoupTest): self.assertEqual(builder, soup.builder) self.assertEqual(kwargs, builder.called_with) - + def test_cdata_list_attributes(self): + # Most attribute values are represented as scalars, but the + # HTML standard says that some attributes, like 'class' have + # space-separated lists as values. + markup = '<a id=" an id " class=" a class "></a>' + soup = self.soup(markup) + + # Note that the spaces are stripped for 'class' but not for 'id'. + a = soup.a + self.assertEqual(" an id ", a['id']) + self.assertEqual(["a", "class"], a['class']) + + # TreeBuilder takes an argument called 'cdata_list_attributes' which lets + # you customize or disable this. As always, you can customize the TreeBuilder + # by passing in a keyword argument to the BeautifulSoup constructor. + soup = self.soup(markup, builder=default_builder, cdata_list_attributes=None) + self.assertEqual(" a class ", soup.a['class']) + + # Here are two ways of saying that `id` is a CDATA list + # attribute and 'class' is not. + for switcheroo in ({'*': 'id'}, {'a': 'id'}): + soup = self.soup(markup, builder=None, cdata_list_attributes=switcheroo) + a = soup.a + self.assertEqual(["an", "id"], a['id']) + self.assertEqual(" a class ", a['class']) + + class TestWarnings(SoupTest): def _no_parser_specified(self, s, is_there=True): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6d79454..a14928e 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -896,7 +896,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.a.contents[0].next_element, "bar") def test_insert_tag(self): - builder = self.default_builder + builder = self.default_builder() soup = self.soup( "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) magic_tag = Tag(soup, builder, 'magictag') |