diff options
-rw-r--r-- | CHANGELOG | 7 | ||||
-rw-r--r-- | bs4/__init__.py | 52 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 35 |
3 files changed, 79 insertions, 15 deletions
@@ -1,3 +1,10 @@ += Unreleased + +* It's now possible to customize the TreeBuilder object by passing + keyword arguments into the BeautifulSoup constructor. The main + reason to do this right now is to change how multi-valued + attributes are treated. [bug=1832978] + = 4.7.1 (20190106) * Fixed a significant performance problem introduced in 4.7.0. [bug=1810617] diff --git a/bs4/__init__.py b/bs4/__init__.py index d3a1086..66797ca 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -98,8 +98,10 @@ class BeautifulSoup(Tag): name a specific parser, so that Beautiful Soup gives you the same results across platforms and virtual environments. - :param builder: A specific TreeBuilder to use instead of looking one - up based on `features`. You shouldn't need to use this. + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. :param parse_only: A SoupStrainer. Only parts of the document matching the SoupStrainer will be considered. This is useful @@ -118,11 +120,17 @@ class BeautifulSoup(Tag): :param kwargs: For backwards compatibility purposes, the constructor accepts certain keyword arguments used in Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4 and there's no need to actually pass keyword - arguments into the constructor. + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + """ if 'convertEntities' in kwargs: + del kwargs['convertEntities'] warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " @@ -177,13 +185,17 @@ class BeautifulSoup(Tag): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") from_encoding = None - if len(kwargs) > 0: - arg = kwargs.keys().pop() - raise TypeError( - "__init__() got an unexpected keyword argument '%s'" % arg) - - if builder is None: - original_features = features + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -194,9 +206,16 @@ class BeautifulSoup(Tag): "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) - builder = builder_class() - if not (original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES): + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): if builder.is_xml: markup_type = "XML" else: @@ -231,7 +250,10 @@ class BeautifulSoup(Tag): markup_type=markup_type ) warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) - + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index f3e69ed..1c6b7a6 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -54,6 +54,41 @@ class TestConstructor(SoupTest): soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding) + def test_custom_builder_class(self): + # Verify that you can pass in a custom Builder class and + # it'll be instantiated with the appropriate keyword arguments. + class Mock(object): + def __init__(self, **kwargs): + self.called_with = kwargs + self.is_xml = True + def initialize_soup(self, soup): + pass + def prepare_markup(self, *args, **kwargs): + return '' + + kwargs = dict( + var="value", + # This is a deprecated BS3-era keyword argument, which + # will be stripped out. + convertEntities=True, + ) + soup = BeautifulSoup('', builder=Mock, **kwargs) + assert isinstance(soup.builder, Mock) + self.assertEqual(dict(var="value"), soup.builder.called_with) + + # You can also instantiate the TreeBuilder yourself. In this + # case, that specific object is used and any keyword arguments + # to the BeautifulSoup constructor are ignored. + builder = Mock(**kwargs) + with warnings.catch_warnings(record=True) as w: + soup = BeautifulSoup( + '', builder=builder, ignored_value=True, + ) + msg = str(w[0].message) + assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.") + self.assertEqual(builder, soup.builder) + self.assertEqual(kwargs, builder.called_with) + class TestWarnings(SoupTest): |