diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 11 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 7 |
2 files changed, 17 insertions, 1 deletion
diff --git a/bs4/__init__.py b/bs4/__init__.py
index bc611c9..308428a 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -236,7 +236,16 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
 
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index fc19046..2f9aba1 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1328,6 +1328,13 @@ class TestPersistence(SoupTest):
         copied = copy.deepcopy(self.tree)
         self.assertEqual(copied.decode(), self.tree.decode())
 
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup('<p> </p>', 'html.parser')
+        self.assertEqual('ascii', soup.original_encoding)
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual('ascii', copy.original_encoding)
+
     def test_unicode_pickle(self):
         # A tree containing Unicode characters can be pickled.
         html = u"<b>\N{SNOWMAN}</b>"