Implement a proper BeautifulSoup.deepcopy rather than parsing the document again.

author: Leonard Richardson <leonardr@segfault.org> 2023-03-26 13:27:11 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2023-03-26 13:27:11 -0400
commit: 544a069c2d802f726c888e9f921f7dc9b79f341a (patch)
tree: 11f74078fe7117d393e0367827e96d71bcc4b883
parent: f7dbd541b2473f5e72b818bb64cea434c39d0725 (diff)
3 files changed, 47 insertions, 28 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 74619a1..6e3a423 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,8 +11,14 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
   tags than there are levels in the Python interpreter stack.
   [bug=1471755]
 
+* The copy and deepcopy algorithms no longer use recursive function
+  calls, either, making it possible to copy deeply nested trees.
+ 
+* Making a copy of a BeautifulSoup object no longer parses the document
+  again, which should improve performance significantly.
+
 * Tag.prettify() will now consistently end prettified markup with
-  a newline. This is a side effect of the work done for bug #1471755.
+  a newline.
 
 * Added unit tests for fuzz test cases created by third
   parties. Most of these tests are skipped since they either point
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 5e1bebe..9c7e67d 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -117,7 +117,7 @@ class BeautifulSoup(Tag):
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-    
+   
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  element_classes=None, **kwargs):
@@ -349,19 +349,19 @@ class BeautifulSoup(Tag):
         self.markup = None
         self.builder.soup = None
 
-    def __copy__(self):
-        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
-        copy = type(self)(
-            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
-        )
+    def _clone(self):
+        """Create a new BeautifulSoup object with the same TreeBuilder,
+        but not associated with any markup.
 
-        # Although we encoded the tree to UTF-8, that may not have
-        # been the encoding of the original markup. Set the copy's
-        # .original_encoding to reflect the original object's
-        # .original_encoding.
-        copy.original_encoding = self.original_encoding
-        return copy
+        This is the first step of the deepcopy process.
+        """
+        clone = type(self)("", None, self.builder)
 
+        # Keep track of the encoding of the original document,
+        # since we won't be parsing it again.
+        clone.original_encoding = self.original_encoding
+        return clone
+        
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
diff --git a/bs4/element.py b/bs4/element.py
index 018f2b3..e75d326 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -966,7 +966,7 @@ class NavigableString(str, PageElement):
         return type(self)(self)
 
     def __copy__(self):
-        return self.__deepcopy__()
+        return self.__deepcopy__({})
 
     def __getnewargs__(self):
         return (str(self),)
@@ -1312,21 +1312,11 @@ class Tag(PageElement):
 
     parserClass = _alias("parser_class")  # BS3
 
-    def __deepcopy__(self, recursive=True):
+    def __deepcopy__(self, memo, recursive=True):
         """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
         Its contents are a copy of the old Tag's contents.
         """
-        clone = type(self)(
-            None, self.builder, self.name, self.namespace,
-            self.prefix, self.attrs, is_xml=self._is_xml,
-            sourceline=self.sourceline, sourcepos=self.sourcepos,
-            can_be_empty_element=self.can_be_empty_element,
-            cdata_list_attributes=self.cdata_list_attributes,
-            preserve_whitespace_tags=self.preserve_whitespace_tags,
-            interesting_string_types=self.interesting_string_types
-        )
-        for attr in ('can_be_empty_element', 'hidden'):
-            setattr(clone, attr, getattr(self, attr))
+        clone = self._clone()
 
         if recursive:
             # Clone this tag's descendants recursively, but without
@@ -1338,7 +1328,9 @@ class Tag(PageElement):
                     # just closed.
                     tag_stack.pop()
                 else:
-                    descendant_clone = element.__copy__(recursive=False)
+                    descendant_clone = element.__deepcopy__(
+                        memo, recursive=False
+                    )
                     # Add to its parent's .contents
                     tag_stack[-1].append(descendant_clone)
 
@@ -1349,8 +1341,29 @@ class Tag(PageElement):
         return clone
 
     def __copy__(self):
-        return self.__deepcopy__()
+        # A copy of a Tag must always be a deep copy, because the
+        # Tag's children can only have one parent at a time.
+        return self.__deepcopy__({})
+
+    def _clone(self):
+        """Create a new Tag just like this one, but with no
+        contents and unattached to any parse tree.
 
+        This is the first step in the deepcopy process.
+        """
+        clone = type(self)(
+            None, self.builder, self.name, self.namespace,
+            self.prefix, self.attrs, is_xml=self._is_xml,
+            sourceline=self.sourceline, sourcepos=self.sourcepos,
+            can_be_empty_element=self.can_be_empty_element,
+            cdata_list_attributes=self.cdata_list_attributes,
+            preserve_whitespace_tags=self.preserve_whitespace_tags,
+            interesting_string_types=self.interesting_string_types
+        )
+        for attr in ('can_be_empty_element', 'hidden'):
+            setattr(clone, attr, getattr(self, attr))
+        return clone
+    
     @property
     def is_empty_element(self):
         """Is this tag an empty-element tag? (aka a self-closing tag)
author	Leonard Richardson <leonardr@segfault.org>	2023-03-26 13:27:11 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2023-03-26 13:27:11 -0400
commit	544a069c2d802f726c888e9f921f7dc9b79f341a (patch)
tree	11f74078fe7117d393e0367827e96d71bcc4b883
parent	f7dbd541b2473f5e72b818bb64cea434c39d0725 (diff)