diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-08-26 18:36:43 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-08-26 18:36:43 -0400 |
commit | cf028c24cfa8b8b4787aea50ad73cc8b18f15770 (patch) | |
tree | 6183099fe99471e1e0bf866f479f9732e3b1bf27 | |
parent | 344ee4494196eb41ee049761193cdad529ee59de (diff) |
It's now possible to override any of the element classes.
-rw-r--r-- | bs4/__init__.py | 30 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 23 |
2 files changed, 38 insertions, 15 deletions
```diff
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 4ec4785..e27ca6f 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -85,7 +85,7 @@ class BeautifulSoup(Tag):

     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
-                 tag_class=Tag, string_class=NavigableString, **kwargs):
+                 element_classes=None, **kwargs):
         """Constructor.

         :param markup: A string or a file-like object representing
@@ -117,6 +117,12 @@ class BeautifulSoup(Tag):
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

+        :param element_classes: A dictionary mapping BeautifulSoup
+        classes like Tag and NavigableString to other classes you'd
+        like to be instantiated instead as the parse tree is
+        built. This is useful for using subclasses to modify the
+        default behavior of Tag or NavigableString.
+
         :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
@@ -185,8 +191,7 @@ class BeautifulSoup(Tag):
             warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
             from_encoding = None

-        self.tag_class = tag_class
-        self.string_class = string_class
+        self.element_classes = element_classes or dict()

         # We need this information to track whether or not the builder
         # was specified well enough that we can omit the 'you need to
@@ -384,14 +389,16 @@ class BeautifulSoup(Tag):
                 sourceline=None, sourcepos=None, **kwattrs):
         """Create a new tag associated with this soup."""
         kwattrs.update(attrs)
-        return self.tag_class(
+        return self.element_classes.get(Tag, Tag)(
             None, self.builder, name, namespace, nsprefix, kwattrs,
             sourceline=sourceline, sourcepos=sourcepos
         )

     def new_string(self, s, subclass=None):
         """Create a new NavigableString associated with this soup."""
-        subclass = subclass or self.string_class
+        subclass = subclass or self.element_classes.get(
+            NavigableString, NavigableString
+        )
         return subclass(s)

     def insert_before(self, successor):
@@ -419,7 +426,16 @@ class BeautifulSoup(Tag):
             self.preserve_whitespace_tag_stack.append(tag)

     def endData(self, containerClass=None):
-        containerClass = containerClass or self.string_class
+
+        # Default container is NavigableString.
+        containerClass = containerClass or NavigableString
+
+        # The user may want us to instantiate some alias for the
+        # container class.
+        containerClass = self.element_classes.get(
+            containerClass, containerClass
+        )
+
         if self.current_data:
             current_data = u''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
@@ -558,7 +574,7 @@ class BeautifulSoup(Tag):
                  or not self.parse_only.search_tag(name, attrs))):
             return None

-        tag = self.tag_class(
+        tag = self.element_classes.get(Tag, Tag)(
             self, self.builder, name, namespace, nsprefix, attrs,
             self.currentTag, self._most_recent_element,
             sourceline=sourceline, sourcepos=sourcepos
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 07f29c4..af5f791 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -13,6 +13,7 @@ from bs4 import (
 )
 from bs4.element import (
     CharsetMetaAttributeValue,
+    Comment,
     ContentMetaAttributeValue,
     SoupStrainer,
     NamespacedAttribute,
@@ -123,24 +124,30 @@ class TestConstructor(SoupTest):
         self.assertEqual(" a class ", a['class'])

     def test_replacement_classes(self):
-        # Test the ability to pass in replacements for the Tag and
-        # NavigableString class, which will be used when building
-        # the tree.
+        # Test the ability to pass in replacements for element classes
+        # which will be used when building the tree.
         class TagPlus(Tag):
             pass

         class StringPlus(NavigableString):
             pass

+        class CommentPlus(Comment):
+            pass
+
         soup = self.soup(
-            "<a><b>foo</b>bar</a>",
-            tag_class=TagPlus, string_class=StringPlus
+            "<a><b>foo</b>bar</a><!--whee-->",
+            element_classes = {
+                Tag: TagPlus,
+                NavigableString: StringPlus,
+                Comment: CommentPlus,
+            }
         )

-        # The tree was built with TagPlus and StringPlus objects,
-        # rather than Tag and String objects.
+        # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
+        # rather than Tag, String, and Comment objects.
         assert all(
-            isinstance(x, (TagPlus, StringPlus))
+            isinstance(x, (TagPlus, StringPlus, CommentPlus))
             for x in soup.recursiveChildGenerator()
         )

```