diff options
author | Leonard Richardson <leonardr@segfault.org> | 2015-06-28 15:58:48 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2015-06-28 15:58:48 -0400 |
commit | 007aa56a1922eea9f364bf3b73e72077046e2c69 (patch) | |
tree | 709684c6357fc69013c5d39b026b1d303da9a40b | |
parent | 9428b9d6ed0d279a72414a986290821ca4f0caaf (diff) |
It's now possible to pickle a BeautifulSoup object no matter which
tree builder was used to create it. However, the only tree builder
that survives the pickling process is the HTMLParserTreeBuilder
('html.parser'). If you unpickle a BeautifulSoup object created with
some other tree builder, soup.builder will be None. [bug=1231545]
-rw-r--r-- | NEWS.txt | 6 | ||||
-rw-r--r-- | bs4/__init__.py | 13 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 1 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 1 | ||||
-rw-r--r-- | bs4/testing.py | 19 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 13 |
6 files changed, 50 insertions, 3 deletions
@@ -98,6 +98,12 @@ Bug fixes: is used in select(). Previously some cases did not result in a NotImplementedError. +* It's now possible to pickle a BeautifulSoup object no matter which + tree builder was used to create it. However, the only tree builder + that survives the pickling process is the HTMLParserTreeBuilder + ('html.parser'). If you unpickle a BeautifulSoup object created with + some other tree builder, soup.builder will be None. [bug=1231545] + = 4.3.2 (20131002) = * Fixed a bug in which short Unicode input was improperly encoded to diff --git a/bs4/__init__.py b/bs4/__init__.py index cb74bd3..b861d87 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -79,9 +79,6 @@ class BeautifulSoup(Tag): NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" - def __copy__(self): - return type(self)(self.encode(), builder=self.builder) - def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): @@ -225,6 +222,16 @@ class BeautifulSoup(Tag): self.markup = None self.builder.soup = None + def __copy__(self): + return type(self)(self.encode(), builder=self.builder) + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + del d['builder'] + return d + def _feed(self): # Convert the document to Unicode. self.builder.reset() diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 820bc80..f8fce56 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -85,6 +85,7 @@ class TreeBuilder(object): features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 25811f1..0101d64 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -127,6 +127,7 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False + picklable = True NAME = HTMLPARSER features = [NAME, HTML, STRICT] diff --git a/bs4/testing.py b/bs4/testing.py index 7232513..9e5e295 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -1,5 +1,6 @@ """Helper classes for tests.""" +import pickle import copy import functools import unittest @@ -64,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object): markup in these tests, there's not much room for interpretation. """ + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + def assertDoctypeHandled(self, doctype_fragment): """Assert that a given doctype string is handled correctly.""" doctype_str, soup = self._document_with_doctype(doctype_fragment) @@ -532,6 +542,15 @@ Hello, world! class XMLTreeBuilderSmokeTest(object): + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + def test_docstring_generated(self): soup = self.soup("<root/>") self.assertEqual( diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index bcb5ed2..b45e35f 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -1,6 +1,8 @@ """Tests to ensure that the html.parser tree builder generates good trees.""" +from pdb import set_trace +import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.builder import HTMLParserTreeBuilder @@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_namespaced_public_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass + + def test_builder_is_pickled(self): + """Unlike most tree builders, HTMLParserTreeBuilder and will + be restored after pickling. + """ + tree = self.soup("<a><b>foo</a>") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertTrue(isinstance(loaded.builder, type(tree.builder))) + + |