summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/__init__.py 27
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/tests/test_lxml.py 12
-rw-r--r-- bs4/tests/test_pageelement.py 6
4 files changed, 35 insertions, 12 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9c7e67d..18d380b 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -38,6 +38,7 @@ from .builder import (
builder_registry,
ParserRejectedMarkup,
XMLParsedAsHTMLWarning,
+ HTMLParserTreeBuilder
)
from .dammit import UnicodeDammit
from .element import (
@@ -366,8 +367,32 @@ class BeautifulSoup(Tag):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
- d['builder'] = None
+ d['builder'] = type(self.builder)
+ # Store the contents as a Unicode string.
+ d['contents'] = []
+ d['markup'] = self.decode()
+
+ # If _most_recent_element is present, it's a Tag object left
+ # over from initial parse. It might not be picklable and we
+ # don't need it.
+ if '_most_recent_element' in d:
+ del d['_most_recent_element']
return d
+
+ def __setstate__(self, state):
+ # If necessary, restore the TreeBuilder by looking it up.
+ self.__dict__ = state
+ if isinstance(self.builder, type):
+ self.builder = self.builder()
+ elif not self.builder:
+ # We don't know which builder was used to build this
+ # parse tree, so use a default we know is always available.
+ self.builder = HTMLParserTreeBuilder()
+ self.builder.soup = self
+ self.reset()
+ self._feed()
+ return state
+
@classmethod
def _decode_markup(cls, markup):
diff --git a/bs4/element.py b/bs4/element.py
index 42b4a51..9c73957 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -911,7 +911,7 @@ class PageElement(object):
:rtype: bool
"""
return getattr(self, '_decomposed', False) or False
-
+
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index c7bf45d..5065b6f 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
assert soup.find('prefix:tag3').name == 'tag3'
assert soup.subtag.find('prefix:tag3').name == 'tag3'
- def test_pickle_removes_builder(self):
- # The lxml TreeBuilder is not picklable, so it won't be
- # preserved in a pickle/unpickle operation.
-
+ def test_pickle_restores_builder(self):
+ # The lxml TreeBuilder is not picklable, so when unpickling
+ # a document created with it, a new TreeBuilder of the
+ # appropriate class is created.
soup = self.soup("<a>some markup</a>")
assert isinstance(soup.builder, self.default_builder)
pickled = pickle.dumps(soup)
unpickled = pickle.loads(pickled)
+
assert "some markup" == unpickled.a.string
- assert unpickled.builder is None
+ assert unpickled.builder != soup.builder
+ assert isinstance(unpickled.builder, self.default_builder)
diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py
index b2773be..e12df79 100644
--- a/bs4/tests/test_pageelement.py
+++ b/bs4/tests/test_pageelement.py
@@ -277,7 +277,7 @@ class TestPersistence(SoupTest):
loaded = pickle.loads(dumped)
assert loaded.__class__ == BeautifulSoup
assert loaded.decode() == self.tree.decode()
-
+
def test_deepcopy_identity(self):
# Making a deepcopy of a tree yields an identical tree.
copied = copy.deepcopy(self.tree)
@@ -291,13 +291,9 @@ class TestPersistence(SoupTest):
markup = "<span>" * limit
soup = self.soup(markup)
- encoded = soup.encode()
copied = copy.copy(soup)
- assert encoded == copied.encode()
-
copied = copy.deepcopy(soup)
- assert encoded == copied.encode()
def test_copy_preserves_encoding(self):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')