Make it possible to pickle a deeply nested BeautifulSoup object.

author: Leonard Richardson <leonardr@segfault.org> 2023-03-27 16:01:43 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2023-03-27 16:01:43 -0400
commit: c63a26a693c14234592b0f92da184a40aa9a2c6b (patch)
tree: 18ac38b27c480a6ff40b0cff95025ada913efe1e
parent: 469bd30fd1d981ea3e2af0d0827956532b41b277 (diff)
5 files changed, 58 insertions, 25 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 6e3a423..fc42551 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,17 +5,27 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
 
 = 4.12.1 (Unreleased)
 
-* Rewrote the code for converting a Beautiful Soup tree to a
-  string, so that it no longer makes recursive function calls. This
-  makes it possible to output documents that have more nested
-  tags than there are levels in the Python interpreter stack.
-  [bug=1471755]
-
-* The copy and deepcopy algorithms no longer use recursive function
-  calls, either, making it possible to copy deeply nested trees.
- 
-* Making a copy of a BeautifulSoup object no longer parses the document
-  again, which should improve performance significantly.
+* The main improvement in this version is a nonrecursive technique
+  for reconstructing the parser events that led to a particular
+  tree structure. This technique is used to avoid situations where,
+  in previous versions, doing something to a very deeply nested tree
+  would overflow the Python interpreter stack:
+
+  1. Outputting a parse tree as a string
+     (e.g. BeautifulSoup.encode()) [bug=1471755]
+
+  2. Making copies of parse trees (copy.copy() and
+     copy.deepcopy() from the Python standard library).
+
+  3. Pickling a BeautifulSoup object. (Note that pickling a Tag
+     object can still cause an overflow.)
+
+* Making a copy of a BeautifulSoup object no longer parses the
+  document again, which should improve performance significantly.
+
+* When a BeautifulSoup object is unpickled, Beautiful Soup now
+  tries to associate an appropriate TreeBuilder with it
+  rather than giving up.
 
 * Tag.prettify() will now consistently end prettified markup with
   a newline.
@@ -23,8 +33,8 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
 * Added unit tests for fuzz test cases created by third
   parties. Most of these tests are skipped since they either point
   out problems in code outside of Beautiful Soup, or problems with
-  Beautiful Soup that haven't been resolved yet, but this puts them
-  all in one convenient place.
+  Beautiful Soup that haven't been resolved yet; but this change
+  puts them all in one convenient place.
 
 = 4.12.0 (20230320)
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9c7e67d..18d380b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -38,6 +38,7 @@ from .builder import (
     builder_registry,
     ParserRejectedMarkup,
     XMLParsedAsHTMLWarning,
+    HTMLParserTreeBuilder
 )
 from .dammit import UnicodeDammit
 from .element import (
@@ -366,8 +367,32 @@ class BeautifulSoup(Tag):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
         if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
-            d['builder'] = None
+            d['builder'] = type(self.builder)
+        # Store the contents as a Unicode string.
+        d['contents'] = []
+        d['markup'] = self.decode()
+
+        # If _most_recent_element is present, it's a Tag object left
+        # over from the initial parse. It might not be picklable and we
+        # don't need it.
+        if '_most_recent_element' in d:
+            del d['_most_recent_element']
         return d
+
+    def __setstate__(self, state):
+        # If necessary, restore the TreeBuilder by instantiating its stored class.
+        self.__dict__ = state
+        if isinstance(self.builder, type):
+            self.builder = self.builder()
+        elif not self.builder:
+            # We don't know which builder was used to build this
+            # parse tree, so use a default we know is always available.
+            self.builder = HTMLParserTreeBuilder()
+        self.builder.soup = self
+        self.reset()
+        self._feed()
+        return state
+
     
     @classmethod
     def _decode_markup(cls, markup):
diff --git a/bs4/element.py b/bs4/element.py
index 42b4a51..9c73957 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -911,7 +911,7 @@ class PageElement(object):
         :rtype: bool
         """
         return getattr(self, '_decomposed', False) or False
-
+   
     # Old non-property versions of the generators, for backwards
     # compatibility with BS3.
     def nextGenerator(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index c7bf45d..5065b6f 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
         assert soup.find('prefix:tag3').name == 'tag3'
         assert soup.subtag.find('prefix:tag3').name == 'tag3'
 
-    def test_pickle_removes_builder(self):
-        # The lxml TreeBuilder is not picklable, so it won't be
-        # preserved in a pickle/unpickle operation.
-
+    def test_pickle_restores_builder(self):
+        # The lxml TreeBuilder is not picklable, so when unpickling
+        # a document created with it, a new TreeBuilder of the
+        # appropriate class is created.
         soup = self.soup("<a>some markup</a>")
         assert isinstance(soup.builder, self.default_builder)
         pickled = pickle.dumps(soup)
         unpickled = pickle.loads(pickled)
+
         assert "some markup" == unpickled.a.string
-        assert unpickled.builder is None
+        assert unpickled.builder != soup.builder
+        assert isinstance(unpickled.builder, self.default_builder)
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index b2773be..e12df79 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -277,7 +277,7 @@ class TestPersistence(SoupTest):
         loaded = pickle.loads(dumped)
         assert loaded.__class__ == BeautifulSoup
         assert loaded.decode() == self.tree.decode()
-
+        
     def test_deepcopy_identity(self):
         # Making a deepcopy of a tree yields an identical tree.
         copied = copy.deepcopy(self.tree)
@@ -291,13 +291,9 @@ class TestPersistence(SoupTest):
         markup = "<span>" * limit
 
         soup = self.soup(markup)
-        encoded = soup.encode()
         
         copied = copy.copy(soup)
-        assert encoded == copied.encode()
-
         copied = copy.deepcopy(soup)
-        assert encoded == copied.encode()
 
     def test_copy_preserves_encoding(self):
         soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
author	Leonard Richardson <leonardr@segfault.org>	2023-03-27 16:01:43 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2023-03-27 16:01:43 -0400
commit	c63a26a693c14234592b0f92da184a40aa9a2c6b (patch)
tree	18ac38b27c480a6ff40b0cff95025ada913efe1e
parent	469bd30fd1d981ea3e2af0d0827956532b41b277 (diff)