diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 14:01:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-07-07 14:01:40 -0400 |
commit | 0c3c1970dcb93bbe591707e43cfba9b24de45d05 (patch) | |
tree | 48be5f0ef368309d401cee00bd0f2d504fdde85f /bs4/__init__.py | |
parent | 66d2597c36b1923c487daae55bfa73e4bb4a66d1 (diff) |
It's now possible to customize the TreeBuilder object by passing
keyword arguments into the BeautifulSoup constructor. The main
reason to do this right now is to change how multi-valued
attributes are treated. [bug=1832978]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 52 |
1 file changed, 37 insertions, 15 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index d3a1086..66797ca 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -98,8 +98,10 @@ class BeautifulSoup(Tag): name a specific parser, so that Beautiful Soup gives you the same results across platforms and virtual environments. - :param builder: A specific TreeBuilder to use instead of looking one - up based on `features`. You shouldn't need to use this. + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. :param parse_only: A SoupStrainer. Only parts of the document matching the SoupStrainer will be considered. This is useful @@ -118,11 +120,17 @@ class BeautifulSoup(Tag): :param kwargs: For backwards compatibility purposes, the constructor accepts certain keyword arguments used in Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4 and there's no need to actually pass keyword - arguments into the constructor. + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + """ if 'convertEntities' in kwargs: + del kwargs['convertEntities'] warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " @@ -177,13 +185,17 @@ class BeautifulSoup(Tag): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. 
Your from_encoding will be ignored.") from_encoding = None - if len(kwargs) > 0: - arg = kwargs.keys().pop() - raise TypeError( - "__init__() got an unexpected keyword argument '%s'" % arg) - - if builder is None: - original_features = features + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -194,9 +206,16 @@ class BeautifulSoup(Tag): "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) - builder = builder_class() - if not (original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES): + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. + if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): if builder.is_xml: markup_type = "XML" else: @@ -231,7 +250,10 @@ class BeautifulSoup(Tag): markup_type=markup_type ) warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) - + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml |