diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-03-24 16:19:29 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-03-24 16:19:29 -0400 |
commit | 54b0a2e35c575bcb525c38374e53ea68d53b5e6d (patch) | |
tree | f88af97287d21b4c3c8ba44d16483fec6cdfc924 | |
parent | 8944fe70574914cabfc9e6fb6eb048d71be39fb1 (diff) |
Implement nonrecursive versions of copy and deepcopy using the new _event_stream generator.
-rw-r--r-- | bs4/element.py | 40 | ||||
-rw-r--r-- | bs4/tests/test_pageelement.py | 15 |
2 files changed, 51 insertions, 4 deletions
diff --git a/bs4/element.py b/bs4/element.py index daffec3..1ad9c5a 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -955,12 +955,19 @@ class NavigableString(str, PageElement): u.setup() return u - def __copy__(self): + def __copy__(self, recursive=False): """A copy of a NavigableString has the same contents and class as the original, but it is not connected to the parse tree. + + :param recursive: This parameter is ignored; it's only defined + so that NavigableString implements the same signature as + Tag. """ return type(self)(self) + def __deepcopy__(self, memo): + return self.__copy__() + def __getnewargs__(self): return (str(self),) @@ -1305,9 +1312,14 @@ class Tag(PageElement): parserClass = _alias("parser_class") # BS3 - def __copy__(self): + def __copy__(self, recursive=True): """A copy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. + + For PageElements in a Beautiful Soup parse tree, __copy__ is + the same as __deepcopy__, because a given PageElement can only + be in one parse tree at a time. Thus, copying the element + requires creating a brand new element. """ clone = type(self)( None, self.builder, self.name, self.namespace, @@ -1320,10 +1332,30 @@ class Tag(PageElement): ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) + + if recursive: + # Clone this tag's descendants recursively, but without + # making any recursive function calls. + tag_stack = [clone] + for event, element in self._event_stream(self.descendants): + if event is Tag.END_ELEMENT_EVENT: + # Stop appending incoming Tags to the Tag that was + # just closed. 
+ tag_stack.pop() + else: + descendant_clone = element.__copy__(recursive=False) + # Add to its parent's .contents + tag_stack[-1].append(descendant_clone) + + if event is Tag.START_ELEMENT_EVENT: + # Add the Tag itself to the stack so that its + # children will be .appended to it. + tag_stack.append(descendant_clone) return clone + def __deepcopy__(self, memo): + return self.__copy__() + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index d98c577..4567935 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -283,6 +283,21 @@ class TestPersistence(SoupTest): copied = copy.deepcopy(self.tree) assert copied.decode() == self.tree.decode() + def test_copy_deeply_nested_document(self): + # This test verifies that copy and deepcopy don't involve any + # recursive function calls. If they did, this test would + # overflow the Python interpreter stack. + limit = sys.getrecursionlimit() + 1 + markup = "<span>" * limit + + soup = self.soup(markup) + + copied = copy.copy(soup) + assert soup.encode() == copied.encode() + + copied = copy.deepcopy(soup) + assert soup.encode() == copied.encode() + def test_copy_preserves_encoding(self): soup = BeautifulSoup(b'<p> </p>', 'html.parser') encoding = soup.original_encoding |