summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/builder/_html5lib.py 71
-rw-r--r-- bs4/tests/test_html5lib.py 26
-rw-r--r-- bs4/tests/test_lxml.py 29
3 files changed, 79 insertions, 47 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 9897675..0d7a1a9 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -78,7 +78,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def elementClass(self, name, namespace):
if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
+ tag = self.soup.new_tag(name)
+ return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
@@ -89,10 +90,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return Element(self.soup, self.soup, None)
def appendChild(self, node):
- self.soup.insert(len(self.soup.contents), node.element)
-
- def testSerializer(self, element):
- return testSerializer(element)
+ # XXX This code is not covered by the BS4 tests.
+ self.soup.append(node.element)
def getDocument(self):
return self.soup
@@ -126,31 +125,17 @@ class Element(html5lib.treebuilders._base.Node):
self.soup = soup
self.namespace = namespace
- def _nodeIndex(self, node, refNode):
- # Finds a node by identity rather than equality
- for index in range(len(self.element.contents)):
- if id(self.element.contents[index]) == id(refNode.element):
- return index
- return None
-
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
- # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
- newStr = NavigableString(self.element.contents[-1]+node.element)
-
- # Remove the old text node
- # (Can't simply use .extract() by itself, because it fails if
- # an equal text node exists within the parent node)
- oldElement = self.element.contents[-1]
- del self.element.contents[-1]
- oldElement.parent = None
- oldElement.extract()
-
- self.element.insert(len(self.element.contents), newStr)
+ # XXX This has O(n^2) performance, for input like
+ # "a</a>a</a>a</a>..."
+ old_element = self.element.contents[-1]
+ new_element = self.soup.new_string(old_element + node.element)
+ old_element.replace_with(new_element)
else:
- self.element.insert(len(self.element.contents), node.element)
+ self.element.append(node.element)
node.parent = self
def getAttributes(self):
@@ -162,58 +147,50 @@ class Element(html5lib.treebuilders._base.Node):
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
- # The Tag constructor calls this method automatically,
- # but html5lib creates a Tag object before setting up
- # the attributes.
+ #
+ # The Tag constructor called this method when the Tag was created,
+ # but we just set/changed the attributes, so call it again.
self.element.contains_substitutions = (
self.soup.builder.set_up_substitutions(
self.element))
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
- text = TextNode(NavigableString(data), self.soup)
+ text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
- index = self._nodeIndex(node, refNode)
+ index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
- newStr = NavigableString(self.element.contents[index-1]+node.element)
- oldNode = self.element.contents[index-1]
- del self.element.contents[index-1]
- oldNode.parent = None
- oldNode.extract()
-
- self.element.insert(index-1, newStr)
+ old_node = self.element.contents[index-1]
+ new_str = self.soup.new_string(old_node + node.element)
+ old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
- index = self._nodeIndex(node.parent, node)
- # XXX This if statement is problematic:
- # https://bugs.launchpad.net/beautifulsoup/+bug/838800
- if index is not None:
- del node.parent.element.contents[index]
- node.element.parent = None
node.element.extract()
- node.parent = None
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
- newParent.appendChild(Element(child, self.soup, namespaces["html"]))
+ newParent.appendChild(
+ Element(child, self.soup, namespaces["html"]))
else:
- newParent.appendChild(TextNode(child, self.soup))
+ newParent.appendChild(
+ TextNode(child, self.soup))
def cloneNode(self):
- node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
+ tag = self.soup.new_tag(self.element.name)
+ node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 5b1d1e4..dcbd204 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -104,12 +104,38 @@ class TestHTML5BuilderInvalidMarkup(TestLXMLBuilderInvalidMarkup):
self.assertSoupEquals("<table><div>Foo</div></table>",
"<div>Foo</div><table></table>")
+ def test_unclosed_a_tag(self):
+ # n.b. the whitespace is important here.
+ markup = """<div id="1">
+ <a href="foo">
+</div>
+<div id="2">
+ <div id="3">
+ <a href="bar"></a>
+ </div>
+</div>"""
+
+ expect = """<div id="1">
+ <a href="foo">
+</a></div><a href="foo">
+</a><div id="2"><a href="foo">
+ </a><div id="3"><a href="foo">
+ </a><a href="bar"></a>
+ </div>
+</div>"""
+ self.assertSoupEquals(markup, expect)
+
def test_incorrectly_nested_tables(self):
self.assertSoupEquals(
'<table><tr><table><tr id="nested">',
('<table><tbody><tr></tr></tbody></table>'
'<table><tbody><tr id="nested"></tr></tbody></table>'))
+ def test_floating_text_in_table(self):
+ self.assertSoupEquals(
+ "<table><td></td>foo<td>bar</td></table>",
+ "foo<table><tbody><tr><td></td><td>bar</td></tr></tbody></table>")
+
def test_empty_element_tag_with_contents(self):
self.assertSoupEquals("<br>foo</br>", "<br />foo<br />")
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 0adef20..359f619 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -332,6 +332,32 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
'<tr><td>foo</td></tr>'
'</table></td>')
+
+ def test_unclosed_a_tag(self):
+ # <a> tags really ought to be closed at some point.
+ #
+ # We have all the <div> tags because HTML5 says to duplicate
+ # the <a> tag rather than closing it, and that's what html5lib
+ # does.
+ markup = """<div id="1">
+ <a href="foo">
+</div>
+<div id="2">
+ <div id="3">
+ <a href="bar"></a>
+ </div>
+</div>"""
+
+ expect = """<div id="1">
+<a href="foo">
+</a></div>
+<div id="2">
+<div id="3">
+<a href="bar"></a>
+</div>
+</div>"""
+ self.assertSoupEquals(markup, expect)
+
def test_unclosed_block_level_elements(self):
# Unclosed block-level elements should be closed.
self.assertSoupEquals(
@@ -355,6 +381,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
'<table><tr><table><tr id="nested">',
'<table><tr><table><tr id="nested"></tr></table></tr></table>')
+ def test_floating_text_in_table(self):
+ self.assertSoupEquals("<table><td></td>foo<td>bar</td></table>")
+
def test_paragraphs_containing_block_display_elements(self):
markup = self.soup("<p>this is the definition:"
"<dl><dt>first case</dt>")