summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-08-26 18:36:43 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-08-26 18:36:43 -0400
commit: cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (patch)
tree: 6183099fe99471e1e0bf866f479f9732e3b1bf27
parent: 344ee4494196eb41ee049761193cdad529ee59de (diff)
It's now possible to override any of the element classes.
-rw-r--r-- bs4/__init__.py 30
-rw-r--r-- bs4/tests/test_soup.py 23
2 files changed, 38 insertions, 15 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 4ec4785..e27ca6f 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -85,7 +85,7 @@ class BeautifulSoup(Tag):
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
- tag_class=Tag, string_class=NavigableString, **kwargs):
+ element_classes=None, **kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
@@ -117,6 +117,12 @@ class BeautifulSoup(Tag):
the document's encoding but you know Beautiful Soup's guess is
wrong.
+ :param element_classes: A dictionary mapping BeautifulSoup
+ classes like Tag and NavigableString to other classes you'd
+ like to be instantiated instead as the parse tree is
+ built. This is useful for using subclasses to modify the
+ default behavior of Tag or NavigableString.
+
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
@@ -185,8 +191,7 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
- self.tag_class = tag_class
- self.string_class = string_class
+ self.element_classes = element_classes or dict()
# We need this information to track whether or not the builder
# was specified well enough that we can omit the 'you need to
@@ -384,14 +389,16 @@ class BeautifulSoup(Tag):
sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup."""
kwattrs.update(attrs)
- return self.tag_class(
+ return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
sourceline=sourceline, sourcepos=sourcepos
)
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup."""
- subclass = subclass or self.string_class
+ subclass = subclass or self.element_classes.get(
+ NavigableString, NavigableString
+ )
return subclass(s)
def insert_before(self, successor):
@@ -419,7 +426,16 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=None):
- containerClass = containerClass or self.string_class
+
+ # Default container is NavigableString.
+ containerClass = containerClass or NavigableString
+
+ # The user may want us to instantiate some alias for the
+ # container class.
+ containerClass = self.element_classes.get(
+ containerClass, containerClass
+ )
+
if self.current_data:
current_data = u''.join(self.current_data)
# If whitespace is not preserved, and this string contains
@@ -558,7 +574,7 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
- tag = self.tag_class(
+ tag = self.element_classes.get(Tag, Tag)(
self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element,
sourceline=sourceline, sourcepos=sourcepos
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 07f29c4..af5f791 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -13,6 +13,7 @@ from bs4 import (
)
from bs4.element import (
CharsetMetaAttributeValue,
+ Comment,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
@@ -123,24 +124,30 @@ class TestConstructor(SoupTest):
self.assertEqual(" a class ", a['class'])
def test_replacement_classes(self):
- # Test the ability to pass in replacements for the Tag and
- # NavigableString class, which will be used when building
- # the tree.
+ # Test the ability to pass in replacements for element classes
+ # which will be used when building the tree.
class TagPlus(Tag):
pass
class StringPlus(NavigableString):
pass
+ class CommentPlus(Comment):
+ pass
+
soup = self.soup(
- "<a><b>foo</b>bar</a>",
- tag_class=TagPlus, string_class=StringPlus
+ "<a><b>foo</b>bar</a><!--whee-->",
+ element_classes = {
+ Tag: TagPlus,
+ NavigableString: StringPlus,
+ Comment: CommentPlus,
+ }
)
- # The tree was built with TagPlus and StringPlus objects,
- # rather than Tag and String objects.
+ # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
+ # rather than Tag, String, and Comment objects.
assert all(
- isinstance(x, (TagPlus, StringPlus))
+ isinstance(x, (TagPlus, StringPlus, CommentPlus))
for x in soup.recursiveChildGenerator()
)