summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 6
-rw-r--r-- bs4/__init__.py 13
-rw-r--r-- bs4/builder/__init__.py 1
-rw-r--r-- bs4/builder/_htmlparser.py 1
-rw-r--r-- bs4/testing.py 19
-rw-r--r-- bs4/tests/test_htmlparser.py 13
6 files changed, 50 insertions, 3 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7bc920e..47d4942 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -98,6 +98,12 @@ Bug fixes:
is used in select(). Previously some cases did not result in a
NotImplementedError.
+* It's now possible to pickle a BeautifulSoup object no matter which
+ tree builder was used to create it. However, the only tree builder
+ that survives the pickling process is the HTMLParserTreeBuilder
+ ('html.parser'). If you unpickle a BeautifulSoup object created with
+ some other tree builder, soup.builder will be None. [bug=1231545]
+
= 4.3.2 (20131002) =
* Fixed a bug in which short Unicode input was improperly encoded to
diff --git a/bs4/__init__.py b/bs4/__init__.py
index cb74bd3..b861d87 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -79,9 +79,6 @@ class BeautifulSoup(Tag):
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
- def __copy__(self):
- return type(self)(self.encode(), builder=self.builder)
-
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
@@ -225,6 +222,16 @@ class BeautifulSoup(Tag):
self.markup = None
self.builder.soup = None
+ def __copy__(self):
+ return type(self)(self.encode(), builder=self.builder)
+
+ def __getstate__(self):
+ # Frequently a tree builder can't be pickled.
+ d = dict(self.__dict__)
+ if 'builder' in d and not self.builder.picklable:
+ del d['builder']
+ return d
+
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index 820bc80..f8fce56 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -85,6 +85,7 @@ class TreeBuilder(object):
features = []
is_xml = False
+ picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 25811f1..0101d64 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -127,6 +127,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
+ picklable = True
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
diff --git a/bs4/testing.py b/bs4/testing.py
index 7232513..9e5e295 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -1,5 +1,6 @@
"""Helper classes for tests."""
+import pickle
import copy
import functools
import unittest
@@ -64,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation.
"""
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -532,6 +542,15 @@ Hello, world!
class XMLTreeBuilderSmokeTest(object):
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index bcb5ed2..b45e35f 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -1,6 +1,8 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
+from pdb import set_trace
+import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
+
+ def test_builder_is_pickled(self):
+ """Unlike most tree builders, HTMLParserTreeBuilder is picklable
+ and will be restored after pickling.
+ """
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+
+