It's now possible to pickle a BeautifulSoup object no matter which tree builder was used to create it. However, the only tree builder that survives the pickling process is the HTMLParserTreeBuilder ('html.parser'). If you unpickle a BeautifulSoup object created with some other tree builder, soup.builder will be None. [bug=1231545]
author: Leonard Richardson <leonardr@segfault.org> 2015-06-28 15:58:48 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2015-06-28 15:58:48 -0400
commit: 007aa56a1922eea9f364bf3b73e72077046e2c69 (patch)
tree: 709684c6357fc69013c5d39b026b1d303da9a40b
parent: 9428b9d6ed0d279a72414a986290821ca4f0caaf (diff)
6 files changed, 50 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7bc920e..47d4942 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -98,6 +98,12 @@ Bug fixes:
   is used in select(). Previously some cases did not result in a
   NotImplementedError.
 
+* It's now possible to pickle a BeautifulSoup object no matter which
+  tree builder was used to create it. However, the only tree builder
+  that survives the pickling process is the HTMLParserTreeBuilder
+  ('html.parser'). If you unpickle a BeautifulSoup object created with
+  some other tree builder, soup.builder will be None. [bug=1231545]
+
 = 4.3.2 (20131002) =
 
 * Fixed a bug in which short Unicode input was improperly encoded to
diff --git a/bs4/__init__.py b/bs4/__init__.py
index cb74bd3..b861d87 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -79,9 +79,6 @@ class BeautifulSoup(Tag):
 
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
 
-    def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
-
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  **kwargs):
@@ -225,6 +222,16 @@ class BeautifulSoup(Tag):
         self.markup = None
         self.builder.soup = None
 
+    def __copy__(self):
+        return type(self)(self.encode(), builder=self.builder)
+
+    def __getstate__(self):
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            del d['builder']
+        return d
+
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 820bc80..f8fce56 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -85,6 +85,7 @@ class TreeBuilder(object):
     features = []
 
     is_xml = False
+    picklable = False
     preserve_whitespace_tags = set()
     empty_element_tags = None # A tag will be considered an empty-element
                               # tag when and only when it has no contents.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 25811f1..0101d64 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -127,6 +127,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):
 
     is_xml = False
+    picklable = True
     NAME = HTMLPARSER
     features = [NAME, HTML, STRICT]
 
diff --git a/bs4/testing.py b/bs4/testing.py
index 7232513..9e5e295 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,5 +1,6 @@
 """Helper classes for tests."""
 
+import pickle
 import copy
 import functools
 import unittest
@@ -64,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object):
     markup in these tests, there's not much room for interpretation.
     """
 
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
     def assertDoctypeHandled(self, doctype_fragment):
         """Assert that a given doctype string is handled correctly."""
         doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -532,6 +542,15 @@ Hello, world!
 
 class XMLTreeBuilderSmokeTest(object):
 
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
     def test_docstring_generated(self):
         soup = self.soup("<root/>")
         self.assertEqual(
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index bcb5ed2..b45e35f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -1,6 +1,8 @@
 """Tests to ensure that the html.parser tree builder generates good
 trees."""
 
+from pdb import set_trace
+import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
 
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_namespaced_public_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
+
+    def test_builder_is_pickled(self):
+        """Unlike most tree builders, HTMLParserTreeBuilder is picklable
+        and will be restored after pickling.
+        """
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+
+
author	Leonard Richardson <leonardr@segfault.org>	2015-06-28 15:58:48 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2015-06-28 15:58:48 -0400
commit	007aa56a1922eea9f364bf3b73e72077046e2c69 (patch)
tree	709684c6357fc69013c5d39b026b1d303da9a40b
parent	9428b9d6ed0d279a72414a986290821ca4f0caaf (diff)