diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-03-27 16:01:43 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-03-27 16:01:43 -0400 |
commit | c63a26a693c14234592b0f92da184a40aa9a2c6b (patch) | |
tree | 18ac38b27c480a6ff40b0cff95025ada913efe1e /bs4/__init__.py | |
parent | 469bd30fd1d981ea3e2af0d0827956532b41b277 (diff) |
Make it possible to pickle a deeply nested BeautifulSoup object.
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r-- | bs4/__init__.py | 27 |
1 files changed, 26 insertions, 1 deletions
```diff
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9c7e67d..18d380b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -38,6 +38,7 @@ from .builder import (
     builder_registry,
     ParserRejectedMarkup,
     XMLParsedAsHTMLWarning,
+    HTMLParserTreeBuilder
 )
 from .dammit import UnicodeDammit
 from .element import (
@@ -366,8 +367,32 @@ class BeautifulSoup(Tag):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
         if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
-            d['builder'] = None
+            d['builder'] = type(self.builder)
+        # Store the contents as a Unicode string.
+        d['contents'] = []
+        d['markup'] = self.decode()
+
+        # If _most_recent_element is present, it's a Tag object left
+        # over from initial parse. It might not be picklable and we
+        # don't need it.
+        if '_most_recent_element' in d:
+            del d['_most_recent_element']
         return d
+
+    def __setstate__(self, state):
+        # If necessary, restore the TreeBuilder by looking it up.
+        self.__dict__ = state
+        if isinstance(self.builder, type):
+            self.builder = self.builder()
+        elif not self.builder:
+            # We don't know which builder was used to build this
+            # parse tree, so use a default we know is always available.
+            self.builder = HTMLParserTreeBuilder()
+        self.builder.soup = self
+        self.reset()
+        self._feed()
+        return state
+
+
     @classmethod
     def _decode_markup(cls, markup):
```