summary | refs | log | tree | commit | diff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2023-03-27 16:01:43 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2023-03-27 16:01:43 -0400
commit: c63a26a693c14234592b0f92da184a40aa9a2c6b (patch)
tree: 18ac38b27c480a6ff40b0cff95025ada913efe1e /bs4/__init__.py
parent: 469bd30fd1d981ea3e2af0d0827956532b41b277 (diff)
Make it possible to pickle a deeply nested BeautifulSoup object.
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- bs4/__init__.py 27
1 files changed, 26 insertions, 1 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9c7e67d..18d380b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -38,6 +38,7 @@ from .builder import (
builder_registry,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
+ HTMLParserTreeBuilder
)
from .dammit import UnicodeDammit
from .element import (
@@ -366,8 +367,32 @@ class BeautifulSoup(Tag):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
- d['builder'] = None
+ d['builder'] = type(self.builder)
+ # Store the contents as a Unicode string.
+ d['contents'] = []
+ d['markup'] = self.decode()
+
+ # If _most_recent_element is present, it's a Tag object left
+ # over from initial parse. It might not be picklable and we
+ # don't need it.
+ if '_most_recent_element' in d:
+ del d['_most_recent_element']
return d
+
+ def __setstate__(self, state):  # Unpickling hook: restore the instance from the dict built by __getstate__.
+ # If necessary, restore the TreeBuilder: __getstate__ stores the builder's class (not an instance) when the builder can't be pickled.
+ self.__dict__ = state
+ if isinstance(self.builder, type):
+ self.builder = self.builder()  # Instantiate the stored builder class with no arguments.
+ elif not self.builder:
+ # We don't know which builder was used to build this
+ # parse tree, so use a default we know is always available.
+ self.builder = HTMLParserTreeBuilder()
+ self.builder.soup = self  # Point the builder back at this object.
+ self.reset()
+ self._feed()  # Presumably re-parses the 'markup' string saved by __getstate__ -- TODO confirm.
+ return state  # NOTE(review): pickle does not appear to use __setstate__'s return value -- confirm whether any caller relies on it.
+
@classmethod
def _decode_markup(cls, markup):