summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-04-05 15:43:58 -0400
commit a6f897b213bb08f0d8d8a1528937541c280abbd6 (patch)
tree 866d3392a854ea27a172e9b456b2160307e39363 /bs4/__init__.py
parent ddadf13ef66122d75eadaf7f10e0937429e6a3a6 (diff)
Embedded CSS and JavaScript are now stored in distinct Stylesheet and
Script tags, which are ignored by methods like get_text(). This feature is not supported by the html5lib treebuilder. [bug=1868861]
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 40
1 files changed, 26 insertions, 14 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index f828cd2..bae7fda 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -15,8 +15,8 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.8.2"
-__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+__version__ = "4.9.0"
+__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@@ -423,6 +423,7 @@ class BeautifulSoup(Tag):
self.currentTag = None
self.tagStack = []
self.preserve_whitespace_tag_stack = []
+ self.string_container_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
@@ -434,14 +435,28 @@ class BeautifulSoup(Tag):
sourceline=sourceline, sourcepos=sourcepos
)
+ def string_container(self, base_class=None):
+ container = base_class or NavigableString
+
+ # There may be a general override of NavigableString.
+ container = self.element_classes.get(
+ container, container
+ )
+
+ # On top of that, we may be inside a tag that needs a special
+ # container class.
+ if self.string_container_stack:
+ container = self.builder.string_containers.get(
+ self.string_container_stack[-1].name, container
+ )
+ return container
+
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this BeautifulSoup
object.
"""
- subclass = subclass or self.element_classes.get(
- NavigableString, NavigableString
- )
- return subclass(s)
+ container = self.string_container(subclass)
+ return container(s)
def insert_before(self, successor):
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
@@ -460,6 +475,8 @@ class BeautifulSoup(Tag):
tag = self.tagStack.pop()
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
+ if self.string_container_stack and tag == self.string_container_stack[-1]:
+ self.string_container_stack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
@@ -474,19 +491,14 @@ class BeautifulSoup(Tag):
self.currentTag = self.tagStack[-1]
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
+ if tag.name in self.builder.string_containers:
+ self.string_container_stack.append(tag)
def endData(self, containerClass=None):
"""Method called by the TreeBuilder when the end of a data segment
occurs.
"""
- # Default container is NavigableString.
- containerClass = containerClass or NavigableString
-
- # The user may want us to instantiate some alias for the
- # container class.
- containerClass = self.element_classes.get(
- containerClass, containerClass
- )
+ containerClass = self.string_container(containerClass)
if self.current_data:
current_data = u''.join(self.current_data)