12 files changed, 77 insertions, 28 deletions
diff --git a/CHANGELOG b/CHANGELOG
index ed44b3a..edf9648 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,7 +3,8 @@
 * It's now possible to customize the TreeBuilder object by passing
    keyword arguments into the BeautifulSoup constructor. The main
    reason to do this right now is to change how multi-valued
-   attributes are treated. [bug=1832978]
+   attributes are treated -- you can do this with the
+   'cdata_list_attributes' argument. [bug=1832978]
 
 = 4.7.1 (20190106)
 
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 4207750..9dad920 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -96,11 +96,15 @@ class TreeBuilder(object):
     
     # A value for these tag/attribute combinations is a space- or
     # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
 
-
-    def __init__(self):
+    USE_DEFAULT = object()
+    
+    def __init__(self, cdata_list_attributes=USE_DEFAULT):
         self.soup = None
+        if cdata_list_attributes is self.USE_DEFAULT:
+            cdata_list_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = cdata_list_attributes
 
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
@@ -131,7 +135,7 @@ class TreeBuilder(object):
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags
-        
+    
     def feed(self, markup):
         raise NotImplementedError()
 
@@ -259,7 +263,7 @@ class HTMLTreeBuilder(TreeBuilder):
     # encounter one of these attributes, we will parse its value into
     # a list of values if possible. Upon output, the list will be
     # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
         "*" : ['class', 'accesskey', 'dropzone'],
         "a" : ['rel', 'rev'],
         "link" :  ['rel', 'rev'],
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6fa8593..6892a93 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -199,7 +199,7 @@ class AttrList(object):
     def __setitem__(self, name, value):
         # If this attribute is a multi-valued attribute for this element,
         # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes
         if (name in list_attr['*']
             or (self.element.name in list_attr
                 and name in list_attr[self.element.name])):
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index ff09ca3..56b8b91 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -214,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     NAME = HTMLPARSER
     features = [NAME, HTML, STRICT]
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+        parser_args = parser_args or []
+        parser_kwargs = parser_kwargs or {}
         if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
-            kwargs['strict'] = False
+            parser_kwargs['strict'] = False
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
-            kwargs['convert_charrefs'] = False
-        self.parser_args = (args, kwargs)
+            parser_kwargs['convert_charrefs'] = False
+        self.parser_args = (parser_args, parser_kwargs)
 
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index b7e172c..27cadcb 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -94,7 +94,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             parser = parser(target=self, strip_cdata=False, encoding=encoding)
         return parser
 
-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
         # TODO: Issue a warning if parser is present but not a
         # callable, since that means there's no way to create new
         # parsers for different encodings.
@@ -103,6 +103,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             self.empty_element_tags = set(empty_element_tags)
         self.soup = None
         self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
         
     def _getNsTag(self, tag):
         # Split the namespace URL out of a fully-qualified lxml tag
diff --git a/bs4/element.py b/bs4/element.py
index 547b8ba..1183f77 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -861,12 +861,27 @@ class Tag(PageElement):
         self.setup(parent, previous)
         self.hidden = False
 
-        # Set up any substitutions, such as the charset in a META tag.
-        if builder is not None:
+        if builder is None:
+            # In the absence of a TreeBuilder, assume this tag is nothing
+            # special.
+            self.can_be_empty_element = False
+            self.cdata_list_attributes = None
+        else:
+            # Set up any substitutions for this tag, such as the charset in a META tag.
             builder.set_up_substitutions(self)
+
+            # Ask the TreeBuilder whether this tag might be an empty-element tag.
             self.can_be_empty_element = builder.can_be_empty_element(name)
-        else:
-            self.can_be_empty_element = False
+
+            # Keep track of the list of attributes of this tag that
+            # might need to be treated as a list.
+            #
+            # For performance reasons, we store the whole data structure
+            # rather than asking the builder about each tag. Asking would
+            # require building a new data structure every time, and
+            # (unlike can_be_empty_element) we almost never need
+            # to check this.
+            self.cdata_list_attributes = builder.cdata_list_attributes
             
     parserClass = _alias("parser_class")  # BS3
 
diff --git a/bs4/testing.py b/bs4/testing.py
index e4a0ffe..e144e7e 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -63,19 +63,19 @@ class SoupTest(unittest.TestCase):
 
     @property
     def default_builder(self):
-        return default_builder()
+        return default_builder
 
     def soup(self, markup, **kwargs):
         """Build a Beautiful Soup object from markup."""
         builder = kwargs.pop('builder', self.default_builder)
         return BeautifulSoup(markup, builder=builder, **kwargs)
 
-    def document_for(self, markup):
+    def document_for(self, markup, **kwargs):
         """Turn an HTML fragment into a document.
 
         The details depend on the builder.
         """
-        return self.default_builder.test_fragment_to_document(markup)
+        return self.default_builder(**kwargs).test_fragment_to_document(markup)
 
     def assertSoupEquals(self, to_parse, compare_parsed_to=None):
         builder = self.default_builder
@@ -232,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
             soup = self.soup("")
             new_tag = soup.new_tag(name)
             self.assertEqual(True, new_tag.is_empty_element)
-    
+
     def test_pickle_and_unpickle_identity(self):
         # Pickling a tree, then unpickling it, yields a tree identical
         # to the original.
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 3a04787..371463a 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
 
     @property
     def default_builder(self):
-        return HTML5TreeBuilder()
+        return HTML5TreeBuilder
 
     def test_soupstrainer(self):
         # The html5lib tree builder does not support SoupStrainers.
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 0381c7d..790489a 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -9,9 +9,7 @@ from bs4.builder._htmlparser import BeautifulSoupHTMLParser
 
 class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
 
-    @property
-    def default_builder(self):
-        return HTMLParserTreeBuilder()
+    default_builder = HTMLParserTreeBuilder
 
     def test_namespaced_system_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 1a4f27c..3b7858f 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
 
     @property
     def default_builder(self):
-        return LXMLTreeBuilder()
+        return LXMLTreeBuilder
 
     def test_out_of_range_entity(self):
         self.assertSoupEquals(
@@ -79,7 +79,7 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
 
     @property
     def default_builder(self):
-        return LXMLTreeBuilderForXML()
+        return LXMLTreeBuilderForXML
 
     def test_namespace_indexing(self):
         # We should not track un-prefixed namespaces as we can only hold one
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 1c6b7a6..213255d 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -24,6 +24,7 @@ from bs4.dammit import (
     EncodingDetector,
 )
 from bs4.testing import (
+    default_builder,
     SoupTest,
     skipIf,
 )
@@ -89,7 +90,33 @@ class TestConstructor(SoupTest):
         self.assertEqual(builder, soup.builder)
         self.assertEqual(kwargs, builder.called_with)
 
-
+    def test_cdata_list_attributes(self):
+        # Most attribute values are represented as scalars, but the
+        # HTML standard says that some attributes, like 'class', have
+        # space-separated lists as values.
+        markup = '<a id=" an id " class=" a class "></a>'
+        soup = self.soup(markup)
+
+        # Note that the spaces are stripped for 'class' but not for 'id'.
+        a = soup.a
+        self.assertEqual(" an id ", a['id'])
+        self.assertEqual(["a", "class"], a['class'])
+
+        # TreeBuilder takes an argument called 'cdata_list_attributes' which lets
+        # you customize or disable this. As always, you can customize the TreeBuilder
+        # by passing in a keyword argument to the BeautifulSoup constructor.
+        soup = self.soup(markup, builder=default_builder, cdata_list_attributes=None)
+        self.assertEqual(" a class ", soup.a['class'])
+
+        # Here are two ways of saying that 'id' is a CDATA list
+        # attribute and 'class' is not.
+        for switcheroo in ({'*': 'id'}, {'a': 'id'}):
+            soup = self.soup(markup, builder=None, cdata_list_attributes=switcheroo)
+            a = soup.a
+            self.assertEqual(["an", "id"], a['id'])
+            self.assertEqual(" a class ", a['class'])
+
+        
 class TestWarnings(SoupTest):
 
     def _no_parser_specified(self, s, is_there=True):
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6d79454..a14928e 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -896,7 +896,7 @@ class TestTreeModification(SoupTest):
         self.assertEqual(soup.a.contents[0].next_element, "bar")
 
     def test_insert_tag(self):
-        builder = self.default_builder
+        builder = self.default_builder()
         soup = self.soup(
             "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
         magic_tag = Tag(soup, builder, 'magictag')