summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/element.py 40
-rw-r--r-- bs4/tests/test_pageelement.py 15
2 files changed, 51 insertions, 4 deletions
diff --git a/bs4/element.py b/bs4/element.py
index daffec3..1ad9c5a 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -955,12 +955,19 @@ class NavigableString(str, PageElement):
u.setup()
return u
- def __copy__(self):
+ def __copy__(self, recursive=False):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
+
+ :param recursive: This parameter is ignored; it's only defined
+ so that NavigableString implements the same signature as
+ Tag.
"""
return type(self)(self)
+ def __deepcopy__(self, memo):
+ return self.__copy__()
+
def __getnewargs__(self):
return (str(self),)
@@ -1305,9 +1312,14 @@ class Tag(PageElement):
parserClass = _alias("parser_class") # BS3
- def __copy__(self):
+ def __copy__(self, recursive=True):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
+
+ For PageElements in a Beautiful Soup parse tree, __copy__ is
+ the same as __deepcopy__, because a given PageElement can only
+ be in one parse tree at a time. Thus, copying the element
+ requires creating a brand new element.
"""
clone = type(self)(
None, self.builder, self.name, self.namespace,
@@ -1320,10 +1332,30 @@ class Tag(PageElement):
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
- for child in self.contents:
- clone.append(child.__copy__())
+
+ if recursive:
+ # Clone this tag's descendants recursively, but without
+ # making any recursive function calls.
+ tag_stack = [clone]
+ for event, element in self._event_stream(self.descendants):
+ if event is Tag.END_ELEMENT_EVENT:
+ # Stop appending incoming Tags to the Tag that was
+ # just closed.
+ tag_stack.pop()
+ else:
+ descendant_clone = element.__copy__(recursive=False)
+ # Add to its parent's .contents
+ tag_stack[-1].append(descendant_clone)
+
+ if event is Tag.START_ELEMENT_EVENT:
+ # Add the Tag itself to the stack so that its
+ # children will be .appended to it.
+ tag_stack.append(descendant_clone)
return clone
+ def __deepcopy__(self, memo):
+ return self.__copy__()
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index d98c577..4567935 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -283,6 +283,21 @@ class TestPersistence(SoupTest):
copied = copy.deepcopy(self.tree)
assert copied.decode() == self.tree.decode()
+ def test_copy_deeply_nested_document(self):
+ # This test verifies that copy and deepcopy don't involve any
+ # recursive function calls. If they did, this test would
+ # overflow the Python interpreter stack.
+ limit = sys.getrecursionlimit() + 1
+ markup = "<span>" * limit
+
+ soup = self.soup(markup)
+
+ copied = copy.copy(soup)
+ assert soup.encode() == copied.encode()
+
+ copied = copy.deepcopy(soup)
+ assert soup.encode() == copied.encode()
+
def test_copy_preserves_encoding(self):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding