diff options
-rw-r--r-- | NEWS.txt | 7 | ||||
-rw-r--r-- | bs4/__init__.py | 3 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 12 | ||||
-rw-r--r-- | bs4/element.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 26 | ||||
-rw-r--r-- | doc/source/index.rst | 62 |
6 files changed, 110 insertions, 12 deletions
@@ -15,9 +15,10 @@ Especially important changes: argument described in the documentation. `text` may eventually change its meaning, but not for a very long time. [bug=1366856] -* Changed the way soup objects work under copy.copy() and - copy.deepcopy(). Copying a NavigableString will give you a new - NavigableString that is not connected to the parse tree. +* Changed the way soup objects work under copy.copy(). Copying a + NavigableString or a Tag will give you a new object that's + equal to the old one but not connected to the parse tree. Patch by + Martijn Peters. [bug=1307490] * Started using a standard MIT license. [bug=1294662] diff --git a/bs4/__init__.py b/bs4/__init__.py index 68e7512..cb74bd3 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -79,6 +79,9 @@ class BeautifulSoup(Tag): NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + def __copy__(self): + return type(self)(self.encode(), builder=self.builder) + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, **kwargs): diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 4eaaaec..ab5793c 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -244,9 +244,9 @@ class Element(html5lib.treebuilders._base.Node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" - print "MOVE", self.element.contents - print "FROM", self.element - print "TO", new_parent.element + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", 
new_parent.element element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -297,9 +297,9 @@ class Element(html5lib.treebuilders._base.Node): element.contents = [] element.next_element = final_next_element - print "DONE WITH MOVE" - print "FROM", self.element - print "TO", new_parent_element + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) diff --git a/bs4/element.py b/bs4/element.py index 0486da2..c70ad5a 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -815,6 +815,18 @@ class Tag(PageElement): parserClass = _alias("parser_class") # BS3 + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)(None, self.builder, self.name, self.namespace, + self.nsprefix, self.attrs) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 22d4b4f..2371591 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1330,6 +1330,32 @@ class TestPersistence(SoupTest): self.assertEqual(s1, s2) self.assertTrue(isinstance(s2, Comment)) + def test_copy_entire_soup(self): + html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): + html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. 
+ self.assertEqual(unicode(div), unicode(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. + self.assertFalse(div is div_copy) + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. + self.assertEqual(None, div_copy.parent) + self.assertEqual(None, div_copy.previous_element) + self.assertEqual(None, div_copy.find(string='Bar').next_element) + self.assertNotEqual(None, div.find(string='Bar').next_element) + class TestSubstitutions(SoupTest): def test_default_formatter_is_minimal(self): diff --git a/doc/source/index.rst b/doc/source/index.rst index f6d3e38..81659ed 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1787,7 +1787,6 @@ attributes, and delete attributes:: tag # <blockquote>Extremely bold</blockquote> - Modifying ``.string`` --------------------- @@ -2419,8 +2418,9 @@ as ``exclude_encodings``:: soup.original_encoding 'WINDOWS-1255' -(This isn't 100% correct, but Windows-1255 is a compatible superset of -ISO-8859-8, so it's close enough.) +Windows-1255 isn't 100% correct, but that encoding is a compatible +superset of ISO-8859-8, so it's close enough. (``exclude_encodings`` +is a new feature in Beautiful Soup 4.4.0.) In rare cases (usually when a UTF-8 document contains text written in a completely different encoding), the only way to get Unicode may be @@ -2609,6 +2609,62 @@ document is Windows-1252, and the document will come out looking like ``UnicodeDammit.detwingle()`` is new in Beautiful Soup 4.1.0. + +Comparing objects for equality +============================== + +Beautiful Soup says that two ``NavigableString`` or ``Tag`` objects +are equal when they represent the same HTML or XML markup. 
In this +example, the two <b> tags are treated as equal, even though they live +in different parts of the object tree, because they both look like +"<b>pizza</b>":: + + markup = "<p>I want <b>pizza</b> and more <b>pizza</b>!</p>" + soup = BeautifulSoup(markup, 'html.parser') + first_b, second_b = soup.find_all('b') + print first_b == second_b + # True + + print first_b.previous_element == second_b.previous_element + # False + +If you want to see whether two variables refer to exactly the same +object, use ``is``:: + + print first_b is second_b + # False + +Copying Beautiful Soup objects +============================== + +You can use ``copy.copy()`` to create a copy of any ``Tag`` or +``NavigableString``:: + + import copy + p_copy = copy.copy(soup.p) + print p_copy + # <p>I want <b>pizza</b> and more <b>pizza</b>!</p> + +The copy is considered equal to the original, since it represents the +same markup as the original, but it's not the same object:: + + print soup.p == p_copy + # True + + print soup.p is p_copy + # False + +The only real difference is that the copy is completely detached from +the original Beautiful Soup object tree, just as if ``extract()`` had +been called on it:: + + print p_copy.parent + # None + +This is because two different ``Tag`` objects can't occupy the same +space at the same time. + + Parsing only part of a document =============================== |