summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-07-07 14:01:40 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-07-07 14:01:40 -0400
commit: 0c3c1970dcb93bbe591707e43cfba9b24de45d05 (patch)
tree: 48be5f0ef368309d401cee00bd0f2d504fdde85f
parent: 66d2597c36b1923c487daae55bfa73e4bb4a66d1 (diff)
It's now possible to customize the TreeBuilder object by passing
keyword arguments into the BeautifulSoup constructor. The main reason to do this right now is to change how multi-valued attributes are treated. [bug=1832978]
-rw-r--r-- CHANGELOG 7
-rw-r--r-- bs4/__init__.py 52
-rw-r--r-- bs4/tests/test_soup.py 35
3 files changed, 79 insertions, 15 deletions
diff --git a/CHANGELOG b/CHANGELOG
index d3c8578..ed44b3a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+= Unreleased
+
+* It's now possible to customize the TreeBuilder object by passing
+ keyword arguments into the BeautifulSoup constructor. The main
+ reason to do this right now is to change how multi-valued
+ attributes are treated. [bug=1832978]
+
= 4.7.1 (20190106)
* Fixed a significant performance problem introduced in 4.7.0. [bug=1810617]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index d3a1086..66797ca 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
- :param builder: A specific TreeBuilder to use instead of looking one
- up based on `features`. You shouldn't need to use this.
+ :param builder: A TreeBuilder subclass to instantiate (or
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
@@ -118,11 +120,17 @@ class BeautifulSoup(Tag):
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4 and there's no need to actually pass keyword
- arguments into the constructor.
+ Beautiful Soup 4; they will result in a warning and then be ignored.
+
+ Apart from this, any keyword arguments passed into the BeautifulSoup
+ constructor are propagated to the TreeBuilder constructor. This
+ makes it possible to configure a TreeBuilder beyond saying
+ which one to use.
+
"""
if 'convertEntities' in kwargs:
+ del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@@ -177,13 +185,17 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
-
- if builder is None:
- original_features = features
+ # We need this information to track whether or not the builder
+ # was specified well enough that we can omit the 'you need to
+ # specify a parser' warning.
+ original_builder = builder
+ original_features = features
+
+ if isinstance(builder, type):
+ # A builder class was passed in; it needs to be instantiated.
+ builder_class = builder
+ builder = None
+ elif builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
@@ -194,9 +206,16 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
- builder = builder_class()
- if not (original_features == builder.NAME or
- original_features in builder.ALTERNATE_NAMES):
+
+ # At this point either we have a TreeBuilder instance in
+ # builder, or we have a builder_class that we can instantiate
+ # with the remaining **kwargs.
+ if builder is None:
+ builder = builder_class(**kwargs)
+ if not original_builder and not (
+ original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES
+ ):
if builder.is_xml:
markup_type = "XML"
else:
@@ -231,7 +250,10 @@ class BeautifulSoup(Tag):
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
-
+ else:
+ if kwargs:
+ warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
+
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index f3e69ed..1c6b7a6 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -54,6 +54,41 @@ class TestConstructor(SoupTest):
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
+ def test_custom_builder_class(self):
+ # Verify that you can pass in a custom Builder class and
+ # it'll be instantiated with the appropriate keyword arguments.
+ class Mock(object):
+ def __init__(self, **kwargs):
+ self.called_with = kwargs
+ self.is_xml = True
+ def initialize_soup(self, soup):
+ pass
+ def prepare_markup(self, *args, **kwargs):
+ return ''
+
+ kwargs = dict(
+ var="value",
+ # This is a deprecated BS3-era keyword argument, which
+ # will be stripped out.
+ convertEntities=True,
+ )
+ soup = BeautifulSoup('', builder=Mock, **kwargs)
+ assert isinstance(soup.builder, Mock)
+ self.assertEqual(dict(var="value"), soup.builder.called_with)
+
+ # You can also instantiate the TreeBuilder yourself. In this
+ # case, that specific object is used and any keyword arguments
+ # to the BeautifulSoup constructor are ignored.
+ builder = Mock(**kwargs)
+ with warnings.catch_warnings(record=True) as w:
+ soup = BeautifulSoup(
+ '', builder=builder, ignored_value=True,
+ )
+ msg = str(w[0].message)
+ assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
+ self.assertEqual(builder, soup.builder)
+ self.assertEqual(kwargs, builder.called_with)
+
class TestWarnings(SoupTest):