summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG7
-rw-r--r--bs4/__init__.py52
-rw-r--r--bs4/tests/test_soup.py35
3 files changed, 79 insertions, 15 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d3c8578..ed44b3a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+= Unreleased
+
+* It's now possible to customize the TreeBuilder object by passing
+ keyword arguments into the BeautifulSoup constructor. The main
+ reason to do this right now is to change how multi-valued
+ attributes are treated. [bug=1832978]
+
= 4.7.1 (20190106)
* Fixed a significant performance problem introduced in 4.7.0. [bug=1810617]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index d3a1086..66797ca 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
- :param builder: A specific TreeBuilder to use instead of looking one
- up based on `features`. You shouldn't need to use this.
+ :param builder: A TreeBuilder subclass to instantiate (or
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
@@ -118,11 +120,17 @@ class BeautifulSoup(Tag):
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4 and there's no need to actually pass keyword
- arguments into the constructor.
+ Beautiful Soup 4; they will result in a warning and then be ignored.
+
+ Apart from this, any keyword arguments passed into the BeautifulSoup
+ constructor are propagated to the TreeBuilder constructor. This
+ makes it possible to configure a TreeBuilder beyond saying
+ which one to use.
+
"""
if 'convertEntities' in kwargs:
+ del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@@ -177,13 +185,17 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
-
- if builder is None:
- original_features = features
+ # We need this information to track whether or not the builder
+ # was specified well enough that we can omit the 'you need to
+ # specify a parser' warning.
+ original_builder = builder
+ original_features = features
+
+ if isinstance(builder, type):
+ # A builder class was passed in; it needs to be instantiated.
+ builder_class = builder
+ builder = None
+ elif builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
@@ -194,9 +206,16 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
- builder = builder_class()
- if not (original_features == builder.NAME or
- original_features in builder.ALTERNATE_NAMES):
+
+ # At this point either we have a TreeBuilder instance in
+ # builder, or we have a builder_class that we can instantiate
+ # with the remaining **kwargs.
+ if builder is None:
+ builder = builder_class(**kwargs)
+ if not original_builder and not (
+ original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES
+ ):
if builder.is_xml:
markup_type = "XML"
else:
@@ -231,7 +250,10 @@ class BeautifulSoup(Tag):
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
-
+ else:
+ if kwargs:
+ warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
+
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f3e69ed..1c6b7a6 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -54,6 +54,41 @@ class TestConstructor(SoupTest):
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
+ def test_custom_builder_class(self):
+ # Verify that you can pass in a custom Builder class and
+ # it'll be instantiated with the appropriate keyword arguments.
+ class Mock(object):
+ def __init__(self, **kwargs):
+ self.called_with = kwargs
+ self.is_xml = True
+ def initialize_soup(self, soup):
+ pass
+ def prepare_markup(self, *args, **kwargs):
+ return ''
+
+ kwargs = dict(
+ var="value",
+ # This is a deprecated BS3-era keyword argument, which
+ # will be stripped out.
+ convertEntities=True,
+ )
+ soup = BeautifulSoup('', builder=Mock, **kwargs)
+ assert isinstance(soup.builder, Mock)
+ self.assertEqual(dict(var="value"), soup.builder.called_with)
+
+ # You can also instantiate the TreeBuilder yourself. In this
+ # case, that specific object is used and any keyword arguments
+ # to the BeautifulSoup constructor are ignored.
+ builder = Mock(**kwargs)
+ with warnings.catch_warnings(record=True) as w:
+ soup = BeautifulSoup(
+ '', builder=builder, ignored_value=True,
+ )
+ msg = str(w[0].message)
+ assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
+ self.assertEqual(builder, soup.builder)
+ self.assertEqual(kwargs, builder.called_with)
+
class TestWarnings(SoupTest):