diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 27 | ||||
-rw-r--r-- | bs4/element.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_pageelement.py | 6 |
4 files changed, 35 insertions, 12 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 9c7e67d..18d380b 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -38,6 +38,7 @@ from .builder import ( builder_registry, ParserRejectedMarkup, XMLParsedAsHTMLWarning, + HTMLParserTreeBuilder ) from .dammit import UnicodeDammit from .element import ( @@ -366,8 +367,32 @@ class BeautifulSoup(Tag): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) if 'builder' in d and d['builder'] is not None and not self.builder.picklable: - d['builder'] = None + d['builder'] = type(self.builder) + # Store the contents as a Unicode string. + d['contents'] = [] + d['markup'] = self.decode() + + # If _most_recent_element is present, it's a Tag object left + # over from initial parse. It might not be picklable and we + # don't need it. + if '_most_recent_element' in d: + del d['_most_recent_element'] return d + + def __setstate__(self, state): + # If necessary, restore the TreeBuilder by looking it up. + self.__dict__ = state + if isinstance(self.builder, type): + self.builder = self.builder() + elif not self.builder: + # We don't know which builder was used to build this + # parse tree, so use a default we know is always available. + self.builder = HTMLParserTreeBuilder() + self.builder.soup = self + self.reset() + self._feed() + return state + @classmethod def _decode_markup(cls, markup): diff --git a/bs4/element.py b/bs4/element.py index 42b4a51..9c73957 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -911,7 +911,7 @@ class PageElement(object): :rtype: bool """ return getattr(self, '_decomposed', False) or False - + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index c7bf45d..5065b6f 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -189,13 +189,15 @@ class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest): assert soup.find('prefix:tag3').name == 'tag3' assert soup.subtag.find('prefix:tag3').name == 'tag3' - def test_pickle_removes_builder(self): - # The lxml TreeBuilder is not picklable, so it won't be - # preserved in a pickle/unpickle operation. - + def test_pickle_restores_builder(self): + # The lxml TreeBuilder is not picklable, so when unpickling + # a document created with it, a new TreeBuilder of the + # appropriate class is created. soup = self.soup("<a>some markup</a>") assert isinstance(soup.builder, self.default_builder) pickled = pickle.dumps(soup) unpickled = pickle.loads(pickled) + assert "some markup" == unpickled.a.string - assert unpickled.builder is None + assert unpickled.builder != soup.builder + assert isinstance(unpickled.builder, self.default_builder) diff --git a/bs4/tests/test_pageelement.py b/bs4/tests/test_pageelement.py index b2773be..e12df79 100644 --- a/bs4/tests/test_pageelement.py +++ b/bs4/tests/test_pageelement.py @@ -277,7 +277,7 @@ class TestPersistence(SoupTest): loaded = pickle.loads(dumped) assert loaded.__class__ == BeautifulSoup assert loaded.decode() == self.tree.decode() - + def test_deepcopy_identity(self): # Making a deepcopy of a tree yields an identical tree. copied = copy.deepcopy(self.tree) @@ -291,13 +291,9 @@ class TestPersistence(SoupTest): markup = "<span>" * limit soup = self.soup(markup) - encoded = soup.encode() copied = copy.copy(soup) - assert encoded == copied.encode() - copied = copy.deepcopy(soup) - assert encoded == copied.encode() def test_copy_preserves_encoding(self): soup = BeautifulSoup(b'<p> </p>', 'html.parser') |