summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 3
-rw-r--r-- bs4/builder/__init__.py 14
-rw-r--r-- bs4/builder/_html5lib.py 2
-rw-r--r-- bs4/builder/_htmlparser.py 11
-rw-r--r-- bs4/builder/_lxml.py 3
-rw-r--r-- bs4/element.py 23
-rw-r--r-- bs4/testing.py 8
-rw-r--r-- bs4/tests/test_html5lib.py 2
-rw-r--r-- bs4/tests/test_htmlparser.py 4
-rw-r--r-- bs4/tests/test_lxml.py 4
-rw-r--r-- bs4/tests/test_soup.py 29
-rw-r--r-- bs4/tests/test_tree.py 2
12 files changed, 77 insertions, 28 deletions
diff --git a/CHANGELOG b/CHANGELOG
index ed44b3a..edf9648 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,7 +3,8 @@
* It's now possible to customize the TreeBuilder object by passing
keyword arguments into the BeautifulSoup constructor. The main
reason to do this right now is to change how multi-valued
- attributes are treated. [bug=1832978]
+ attributes are treated -- you can do this with the
+ 'cdata_list_attributes' argument. [bug=1832978]
= 4.7.1 (20190106)
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4207750..9dad920 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -96,11 +96,15 @@ class TreeBuilder(object):
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
-
- def __init__(self):
+ USE_DEFAULT = object()
+
+ def __init__(self, cdata_list_attributes=USE_DEFAULT):
self.soup = None
+ if cdata_list_attributes is self.USE_DEFAULT:
+ cdata_list_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = cdata_list_attributes
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
@@ -131,7 +135,7 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
@@ -259,7 +263,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
- cdata_list_attributes = {
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6fa8593..6892a93 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -199,7 +199,7 @@ class AttrList(object):
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
- list_attr = HTML5TreeBuilder.cdata_list_attributes
+ list_attr = self.element.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ff09ca3..56b8b91 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -214,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
- def __init__(self, *args, **kwargs):
+ def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+ parser_args = parser_args or []
+ parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- kwargs['strict'] = False
+ parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- kwargs['convert_charrefs'] = False
- self.parser_args = (args, kwargs)
+ parser_kwargs['convert_charrefs'] = False
+ self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index b7e172c..27cadcb 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -94,7 +94,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
- def __init__(self, parser=None, empty_element_tags=None):
+ def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@@ -103,6 +103,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
diff --git a/bs4/element.py b/bs4/element.py
index 547b8ba..1183f77 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -861,12 +861,27 @@ class Tag(PageElement):
self.setup(parent, previous)
self.hidden = False
- # Set up any substitutions, such as the charset in a META tag.
- if builder is not None:
+ if builder is None:
+ # In the absence of a TreeBuilder, assume this tag is nothing
+ # special.
+ self.can_be_empty_element = False
+ self.cdata_list_attributes = None
+ else:
+ # Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
+
+ # Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name)
- else:
- self.can_be_empty_element = False
+
+ # Keep track of the list of attributes of this tag that
+ # might need to be treated as a list.
+ #
+ # For performance reasons, we store the whole data structure
+ # rather than asking the question of every tag. Asking would
+ # require building a new data structure every time, and
+ # (unlike can_be_empty_element), we almost never need
+ # to check this.
+ self.cdata_list_attributes = builder.cdata_list_attributes
parserClass = _alias("parser_class") # BS3
diff --git a/bs4/testing.py b/bs4/testing.py
index e4a0ffe..e144e7e 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -63,19 +63,19 @@ class SoupTest(unittest.TestCase):
@property
def default_builder(self):
- return default_builder()
+ return default_builder
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
- def document_for(self, markup):
+ def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
- return self.default_builder.test_fragment_to_document(markup)
+ return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
@@ -232,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("")
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
-
+
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 3a04787..371463a 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
@property
def default_builder(self):
- return HTML5TreeBuilder()
+ return HTML5TreeBuilder
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 0381c7d..790489a 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -9,9 +9,7 @@ from bs4.builder._htmlparser import BeautifulSoupHTMLParser
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
- @property
- def default_builder(self):
- return HTMLParserTreeBuilder()
+ default_builder = HTMLParserTreeBuilder
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 1a4f27c..3b7858f 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilder()
+ return LXMLTreeBuilder
def test_out_of_range_entity(self):
self.assertSoupEquals(
@@ -79,7 +79,7 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
@property
def default_builder(self):
- return LXMLTreeBuilderForXML()
+ return LXMLTreeBuilderForXML
def test_namespace_indexing(self):
# We should not track un-prefixed namespaces as we can only hold one
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 1c6b7a6..213255d 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -24,6 +24,7 @@ from bs4.dammit import (
EncodingDetector,
)
from bs4.testing import (
+ default_builder,
SoupTest,
skipIf,
)
@@ -89,7 +90,33 @@ class TestConstructor(SoupTest):
self.assertEqual(builder, soup.builder)
self.assertEqual(kwargs, builder.called_with)
-
+ def test_cdata_list_attributes(self):
+ # Most attribute values are represented as scalars, but the
+ # HTML standard says that some attributes, like 'class', have
+ # space-separated lists as values.
+ markup = '<a id=" an id " class=" a class "></a>'
+ soup = self.soup(markup)
+
+ # Note that the spaces are stripped for 'class' but not for 'id'.
+ a = soup.a
+ self.assertEqual(" an id ", a['id'])
+ self.assertEqual(["a", "class"], a['class'])
+
+ # TreeBuilder takes an argument called 'cdata_list_attributes' which lets
+ # you customize or disable this. As always, you can customize the TreeBuilder
+ # by passing in a keyword argument to the BeautifulSoup constructor.
+ soup = self.soup(markup, builder=default_builder, cdata_list_attributes=None)
+ self.assertEqual(" a class ", soup.a['class'])
+
+ # Here are two ways of saying that 'id' is a CDATA list
+ # attribute and 'class' is not.
+ for switcheroo in ({'*': 'id'}, {'a': 'id'}):
+ soup = self.soup(markup, builder=None, cdata_list_attributes=switcheroo)
+ a = soup.a
+ self.assertEqual(["an", "id"], a['id'])
+ self.assertEqual(" a class ", a['class'])
+
+
class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6d79454..a14928e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -896,7 +896,7 @@ class TestTreeModification(SoupTest):
self.assertEqual(soup.a.contents[0].next_element, "bar")
def test_insert_tag(self):
- builder = self.default_builder
+ builder = self.default_builder()
soup = self.soup(
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
magic_tag = Tag(soup, builder, 'magictag')