From 8d0d071f41c59b16e9f4cf361cf08c3359cce05a Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 08:27:39 -0500 Subject: Minor cleanup. --- bs4/builder/_html5lib.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 9897675..42bbdfa 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -128,8 +128,8 @@ class Element(html5lib.treebuilders._base.Node): def _nodeIndex(self, node, refNode): # Finds a node by identity rather than equality - for index in range(len(self.element.contents)): - if id(self.element.contents[index]) == id(refNode.element): + for index, element in enumerate(self.element.contents): + if id(element) == id(refNode.element): return index return None @@ -138,14 +138,11 @@ class Element(html5lib.treebuilders._base.Node): and self.element.contents[-1].__class__ == NavigableString): # Concatenate new text onto old text node # (TODO: This has O(n^2) performance, for input like "aaa...") - newStr = NavigableString(self.element.contents[-1]+node.element) + newStr = self.soup.new_string( + self.element.contents[-1]+node.element) # Remove the old text node - # (Can't simply use .extract() by itself, because it fails if - # an equal text node exists within the parent node) oldElement = self.element.contents[-1] - del self.element.contents[-1] - oldElement.parent = None oldElement.extract() self.element.insert(len(self.element.contents), newStr) @@ -171,7 +168,7 @@ class Element(html5lib.treebuilders._base.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): - text = TextNode(NavigableString(data), self.soup) + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: self.insertBefore(text, insertBefore) else: @@ -184,8 +181,6 @@ class Element(html5lib.treebuilders._base.Node): # (See comments in appendChild) newStr = NavigableString(self.element.contents[index-1]+node.element) oldNode = self.element.contents[index-1] - del self.element.contents[index-1] - oldNode.parent = None oldNode.extract() self.element.insert(index-1, newStr) @@ -194,14 +189,7 @@ class Element(html5lib.treebuilders._base.Node): node.parent = self def removeChild(self, node): - index = self._nodeIndex(node.parent, node) - # XXX This if statement is problematic: - # https://bugs.launchpad.net/beautifulsoup/+bug/838800 - if index is not None: - del node.parent.element.contents[index] - node.element.parent = None node.element.extract() - node.parent = None def reparentChildren(self, newParent): while self.element.contents: @@ -213,7 +201,8 @@ class Element(html5lib.treebuilders._base.Node): newParent.appendChild(TextNode(child, self.soup)) def cloneNode(self): - node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace) + tag = self.soup.new_tag(self.element.name) + node = Element(tag, self.soup, self.namespace) for key,value in self.attributes: node.attributes[key] = value return node -- cgit v1.2.3 From 6675b6b04bb3228bdca3fe70d1c632c7e180fc31 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 10:25:56 -0500 Subject: Tested improvements to html5lib treebuilder. --- bs4/tests/test_html5lib.py | 21 +++++++++++++++++++++ bs4/tests/test_lxml.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index 5b1d1e4..d257392 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -104,6 +104,27 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): self.assertSoupEquals("
Foo
", "
Foo
") + def test_unclosed_a_tag(self): + # n.b. the whitespace is important here. + markup = """
+ +
+""" + + expect = """
+ +
+""" + self.assertSoupEquals(markup, expect) + def test_incorrectly_nested_tables(self): self.assertSoupEquals( '
', diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 0adef20..e2cb2d2 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -332,6 +332,32 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '' '
foo
') + + def test_unclosed_a_tag(self): + # tags really ought to be closed at some point. + # + # We have all the
tags because HTML5 says to duplicate + # the tag rather than closing it, and that's what html5lib + # does. + markup = """ +""" + + expect = """
+ +
+
+
+ +
+
""" + self.assertSoupEquals(markup, expect) + def test_unclosed_block_level_elements(self): # Unclosed block-level elements should be closed. self.assertSoupEquals( -- cgit v1.2.3 From 2925f54230552441dda94c8c9e8ef8f8c7ad0f60 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 10:28:10 -0500 Subject: Use append instead of insert. --- bs4/builder/_html5lib.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 42bbdfa..01ed35a 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -144,10 +144,9 @@ class Element(html5lib.treebuilders._base.Node): # Remove the old text node oldElement = self.element.contents[-1] oldElement.extract() - - self.element.insert(len(self.element.contents), newStr) + self.element.append(newStr) else: - self.element.insert(len(self.element.contents), node.element) + self.element.append(node.element) node.parent = self def getAttributes(self): -- cgit v1.2.3 From d8edc821b2bef6dd302ee88782f8c458e06eaf37 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 11:03:44 -0500 Subject: Tested and cleaned up html5lib insertBefore. --- bs4/builder/_html5lib.py | 25 +++++++++++-------------- bs4/tests/test_html5lib.py | 5 +++++ bs4/tests/test_lxml.py | 3 +++ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 01ed35a..b30a4d3 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -137,14 +137,11 @@ class Element(html5lib.treebuilders._base.Node): if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # Concatenate new text onto old text node - # (TODO: This has O(n^2) performance, for input like "aaa...") - newStr = self.soup.new_string( - self.element.contents[-1]+node.element) - - # Remove the old text node - oldElement = self.element.contents[-1] - oldElement.extract() - self.element.append(newStr) + # XXX This has O(n^2) performance, for input like + # "aaa..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + node.element) + old_element.replace_with(new_element) else: self.element.append(node.element) node.parent = self @@ -158,9 +155,11 @@ class Element(html5lib.treebuilders._base.Node): self.element[name] = value # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. + # # The Tag constructor calls this method automatically, # but html5lib creates a Tag object before setting up - # the attributes. + # its attributes, so Tags are initially created with no + # attributes. self.element.contains_substitutions = ( self.soup.builder.set_up_substitutions( self.element)) @@ -178,11 +177,9 @@ class Element(html5lib.treebuilders._base.Node): if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[index-1].__class__ == NavigableString): # (See comments in appendChild) - newStr = NavigableString(self.element.contents[index-1]+node.element) - oldNode = self.element.contents[index-1] - oldNode.extract() - - self.element.insert(index-1, newStr) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) else: self.element.insert(index, node.element) node.parent = self diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index d257392..dcbd204 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -131,6 +131,11 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup): ('
' '
')) + def test_floating_text_in_table(self): + self.assertSoupEquals( + "foo
bar
", + "foo
bar
") + def test_empty_element_tag_with_contents(self): self.assertSoupEquals("
foo
", "
foo
") diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index e2cb2d2..359f619 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -381,6 +381,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): '
', '
') + def test_floating_text_in_table(self): + self.assertSoupEquals("foo
bar
") + def test_paragraphs_containing_block_display_elements(self): markup = self.soup("

this is the definition:" "

first case
") -- cgit v1.2.3 From 239ec495d5029edd834e8fd7b0d89c64ee52f9bc Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 11:51:11 -0500 Subject: Minor cleanup. --- bs4/builder/_html5lib.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index b30a4d3..a9547cd 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -78,7 +78,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def elementClass(self, name, namespace): if namespace is not None: warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) - return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace) + tag = self.soup.new_tag(name) + return Element(tag, self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) @@ -89,10 +90,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return Element(self.soup, self.soup, None) def appendChild(self, node): - self.soup.insert(len(self.soup.contents), node.element) - - def testSerializer(self, element): - return testSerializer(element) + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) def getDocument(self): return self.soup @@ -192,9 +191,11 @@ class Element(html5lib.treebuilders._base.Node): child = self.element.contents[0] child.extract() if isinstance(child, Tag): - newParent.appendChild(Element(child, self.soup, namespaces["html"])) + newParent.appendChild( + Element(child, self.soup, namespaces["html"])) else: - newParent.appendChild(TextNode(child, self.soup)) + newParent.appendChild( + TextNode(child, self.soup)) def cloneNode(self): tag = self.soup.new_tag(self.element.name) -- cgit v1.2.3 From 8d17a394988fb70edbf4cc981ca6799964ae5d52 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 12:01:33 -0500 Subject: Removed _nodeIndex, because the misfeature it works around is now gone. --- bs4/builder/_html5lib.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index a9547cd..dccad83 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -125,13 +125,6 @@ class Element(html5lib.treebuilders._base.Node): self.soup = soup self.namespace = namespace - def _nodeIndex(self, node, refNode): - # Finds a node by identity rather than equality - for index, element in enumerate(self.element.contents): - if id(element) == id(refNode.element): - return index - return None - def appendChild(self, node): if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[-1].__class__ == NavigableString): @@ -172,7 +165,7 @@ class Element(html5lib.treebuilders._base.Node): self.appendChild(text) def insertBefore(self, node, refNode): - index = self._nodeIndex(node, refNode) + index = self.element.index(refNode.element) if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[index-1].__class__ == NavigableString): # (See comments in appendChild) -- cgit v1.2.3 From 6e13d74eaabcaf39d261204777763e32172e0ec5 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 12:05:15 -0500 Subject: Clarified comment. --- bs4/builder/_html5lib.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index dccad83..0d7a1a9 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -148,10 +148,8 @@ class Element(html5lib.treebuilders._base.Node): # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. # - # The Tag constructor calls this method automatically, - # but html5lib creates a Tag object before setting up - # its attributes, so Tags are initially created with no - # attributes. + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. self.element.contains_substitutions = ( self.soup.builder.set_up_substitutions( self.element)) -- cgit v1.2.3 From 8cd893c5094e96c7bcdaa735356f4d803210ef34 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 15 Feb 2012 12:06:03 -0500 Subject: Added to NEWS. --- NEWS.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.txt b/NEWS.txt index d264bff..98535ef 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -1,3 +1,9 @@ += 4.0.0b6 () = + +* Fixed a bug that caused the html5lib tree builder to build a + partially disconnected tree. Generally cleaned up the html5lib tree + builder. + = 4.0.0b5 (20120209) = * Rationalized Beautiful Soup's treatment of CSS class. A tag -- cgit v1.2.3