summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt6
-rw-r--r--bs4/__init__.py1
-rw-r--r--bs4/builder/_html5lib.py64
-rw-r--r--bs4/element.py13
-rw-r--r--bs4/tests/test_html5lib.py13
5 files changed, 84 insertions, 13 deletions
diff --git a/NEWS.txt b/NEWS.txt
index adb957d..904dc07 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,9 @@
+= 4.3.1 (Unreleased) =
+
+* Fixed yet another problem with the html5lib tree builder, caused by
+ html5lib's tendency to rearrange the tree during
+ parsing. [bug=1189267]
+
= 4.3.0 (20130812) =
* Instead of converting incoming data to Unicode and feeding it to the
diff --git a/bs4/__init__.py b/bs4/__init__.py
index ace72f1..75c1aaa 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -279,6 +279,7 @@ class BeautifulSoup(Tag):
parent = parent or self.currentTag
most_recent_element = most_recent_element or self._most_recent_element
o.setup(parent, most_recent_element)
+
if most_recent_element is not None:
most_recent_element.next_element = o
self._most_recent_element = o
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6ed5055..7de36ae 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -139,6 +139,9 @@ class Element(html5lib.treebuilders._base.Node):
else:
child = node.element
+ if not isinstance(child, basestring) and child.parent is not None:
+ node.element.extract()
+
if (string_child and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
@@ -147,12 +150,23 @@ class Element(html5lib.treebuilders._base.Node):
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
- self.soup._most_recent_element = new_element
+ self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
- self.soup.object_was_parsed(child, parent=self.element)
+
+ # Tell Beautiful Soup to act as if it parsed this element
+ # immediately after the parent's last descendant. (Or
+ # immediately after the parent, if it has no children.)
+ if self.element.contents:
+ most_recent_element = self.element._last_descendant(False)
+ else:
+ most_recent_element = self.element
+
+ self.soup.object_was_parsed(
+ child, parent=self.element,
+ most_recent_element=most_recent_element)
def getAttributes(self):
return AttrList(self.element)
@@ -201,13 +215,47 @@ class Element(html5lib.treebuilders._base.Node):
def removeChild(self, node):
node.element.extract()
- pass
- def reparentChildren(self, newParent):
- while self.element.contents:
- child = self.element.contents[0]
- child.extract()
- newParent.appendChild(child)
+ def reparentChildren(self, new_parent):
+ """Move all of this tag's children into another tag."""
+ element = self.element
+ new_parent_element = new_parent.element
+ # Determine what this tag's next_element will be once all the children
+ # are removed.
+ final_next_element = element.next_sibling
+
+ new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+ if len(new_parent_element.contents) > 0:
+ # The new parent already contains children. We will be
+ # appending this tag's children to the end.
+ new_parents_last_child = new_parent_element.contents[-1]
+ new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
+ else:
+ # The new parent contains no children.
+ new_parents_last_child = None
+ new_parents_last_descendant_next_element = new_parent_element.next_element
+
+ to_append = element.contents
+ append_after = new_parent.element.contents
+ if len(to_append) > 0:
+ # Set the first child's previous_element and previous_sibling
+ # to elements within the new parent
+ first_child = to_append[0]
+ first_child.previous_element = new_parents_last_descendant
+ first_child.previous_sibling = new_parents_last_child
+
+ # Fix the last child's next_element and next_sibling
+ last_child = to_append[-1]
+ last_child.next_element = new_parents_last_descendant_next_element
+ last_child.next_sibling = None
+
+ for child in to_append:
+ child.parent = new_parent_element
+ new_parent_element.contents.append(child)
+
+ # Now that this element has no children, change its .next_element.
+ element.contents = []
+ element.next_element = final_next_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/bs4/element.py b/bs4/element.py
index e10e100..caa855e 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -255,13 +255,16 @@ class PageElement(object):
self.previous_sibling = self.next_sibling = None
return self
- def _last_descendant(self, is_initialized=True):
+ def _last_descendant(self, is_initialized=True, accept_self=True):
"Finds the last element beneath this object to be parsed."
if is_initialized and self.next_sibling:
- return self.next_sibling.previous_element
- last_child = self
- while isinstance(last_child, Tag) and last_child.contents:
- last_child = last_child.contents[-1]
+ last_child = self.next_sibling.previous_element
+ else:
+ last_child = self
+ while isinstance(last_child, Tag) and last_child.contents:
+ last_child = last_child.contents[-1]
+ if not accept_self and last_child == self:
+ last_child = None
return last_child
# BS3: Not part of the API!
_lastRecursiveChild = _last_descendant
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 2a3b41e..594c3e1 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -70,3 +70,16 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())
+
+ def test_reparented_markup(self):
+ markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
+ soup = self.soup(markup)
+ self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+ self.assertEqual(2, len(soup.find_all('p')))
+
+
+ def test_reparented_markup_ends_with_whitespace(self):
+ markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
+ soup = self.soup(markup)
+ self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+ self.assertEqual(2, len(soup.find_all('p')))