summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 7
-rw-r--r-- bs4/__init__.py 3
-rw-r--r-- bs4/builder/_html5lib.py 12
-rw-r--r-- bs4/element.py 12
-rw-r--r-- bs4/tests/test_tree.py 26
-rw-r--r-- doc/source/index.rst 62
6 files changed, 110 insertions, 12 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 6eeebd2..7bc920e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -15,9 +15,10 @@ Especially important changes:
argument described in the documentation. `text` may eventually
change its meaning, but not for a very long time. [bug=1366856]
-* Changed the way soup objects work under copy.copy() and
- copy.deepcopy(). Copying a NavigableString will give you a new
- NavigableString that is not connected to the parse tree.
+* Changed the way soup objects work under copy.copy(). Copying a
+ NavigableString or a Tag will give you a new object of the same type
+ that's equal to the old one but not connected to the parse tree.
+ Patch by Martijn Peters. [bug=1307490]
* Started using a standard MIT license. [bug=1294662]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 68e7512..cb74bd3 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -79,6 +79,9 @@ class BeautifulSoup(Tag):
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+ def __copy__(self):
+ return type(self)(self.encode(), builder=self.builder)
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 4eaaaec..ab5793c 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -244,9 +244,9 @@ class Element(html5lib.treebuilders._base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
- print "MOVE", self.element.contents
- print "FROM", self.element
- print "TO", new_parent.element
+ # print "MOVE", self.element.contents
+ # print "FROM", self.element
+ # print "TO", new_parent.element
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -297,9 +297,9 @@ class Element(html5lib.treebuilders._base.Node):
element.contents = []
element.next_element = final_next_element
- print "DONE WITH MOVE"
- print "FROM", self.element
- print "TO", new_parent_element
+ # print "DONE WITH MOVE"
+ # print "FROM", self.element
+ # print "TO", new_parent_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/bs4/element.py b/bs4/element.py
index 0486da2..c70ad5a 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -815,6 +815,18 @@ class Tag(PageElement):
parserClass = _alias("parser_class") # BS3
+ def __copy__(self):
+ """A copy of a Tag is a new Tag, unconnected to the parse tree.
+ Its contents are a copy of the old Tag's contents.
+ """
+ clone = type(self)(None, self.builder, self.name, self.namespace,
+ self.nsprefix, self.attrs)
+ for attr in ('can_be_empty_element', 'hidden'):
+ setattr(clone, attr, getattr(self, attr))
+ for child in self.contents:
+ clone.append(child.__copy__())
+ return clone
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 22d4b4f..2371591 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -1330,6 +1330,32 @@ class TestPersistence(SoupTest):
self.assertEqual(s1, s2)
self.assertTrue(isinstance(s2, Comment))
+ def test_copy_entire_soup(self):
+ html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+ soup = self.soup(html)
+ soup_copy = copy.copy(soup)
+ self.assertEqual(soup, soup_copy)
+
+ def test_copy_tag_copies_contents(self):
+ html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+ soup = self.soup(html)
+ div = soup.div
+ div_copy = copy.copy(div)
+
+ # The two tags look the same, and evaluate to equal.
+ self.assertEqual(unicode(div), unicode(div_copy))
+ self.assertEqual(div, div_copy)
+
+ # But they're not the same object.
+ self.assertFalse(div is div_copy)
+
+ # And they don't have the same relation to the parse tree. The
+ # copy is not associated with a parse tree at all.
+ self.assertEqual(None, div_copy.parent)
+ self.assertEqual(None, div_copy.previous_element)
+ self.assertEqual(None, div_copy.find(string='Bar').next_element)
+ self.assertNotEqual(None, div.find(string='Bar').next_element)
+
class TestSubstitutions(SoupTest):
def test_default_formatter_is_minimal(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f6d3e38..81659ed 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1787,7 +1787,6 @@ attributes, and delete attributes::
tag
# <blockquote>Extremely bold</blockquote>
-
Modifying ``.string``
---------------------
@@ -2419,8 +2418,9 @@ as ``exclude_encodings``::
soup.original_encoding
'WINDOWS-1255'
-(This isn't 100% correct, but Windows-1255 is a compatible superset of
-ISO-8859-8, so it's close enough.)
+Windows-1255 isn't 100% correct, but that encoding is a compatible
+superset of ISO-8859-8, so it's close enough. (``exclude_encodings``
+is a new feature in Beautiful Soup 4.4.0.)
In rare cases (usually when a UTF-8 document contains text written in
a completely different encoding), the only way to get Unicode may be
@@ -2609,6 +2609,62 @@ document is Windows-1252, and the document will come out looking like
``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0.
+
+Comparing objects for equality
+==============================
+
+Beautiful Soup says that two ``NavigableString`` or ``Tag`` objects
+are equal when they represent the same HTML or XML markup. In this
+example, the two <b> tags are treated as equal, even though they live
+in different parts of the object tree, because they both look like
+"<b>pizza</b>"::
+
+ markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>"
+ soup = BeautifulSoup(markup, 'html.parser')
+ first_b, second_b = soup.find_all('b')
+ print first_b == second_b
+ # True
+
+ print first_b.previous_element == second_b.previous_element
+ # False
+
+If you want to see whether two variables refer to exactly the same
+object, use ``is``::
+
+ print first_b is second_b
+ # False
+
+Copying Beautiful Soup objects
+==============================
+
+You can use ``copy.copy()`` to create a copy of any ``Tag`` or
+``NavigableString``::
+
+ import copy
+ p_copy = copy.copy(soup.p)
+ print p_copy
+ # <p>I want <b>pizza</b> and more <b>pizza</b>!</p>
+
+The copy is considered equal to the original, since it represents the
+same markup as the original, but it's not the same object::
+
+ print soup.p == p_copy
+ # True
+
+ print soup.p is p_copy
+ # False
+
+The only real difference is that the copy is completely detached from
+the original Beautiful Soup object tree, just as if ``extract()`` had
+been called on it::
+
+ print p_copy.parent
+ # None
+
+This is because two different ``Tag`` objects can't occupy the same
+space at the same time.
+
+
Parsing only part of a document
===============================