diff options
Diffstat (limited to 'bs4/builder/_html5lib.py')
-rw-r--r-- | bs4/builder/_html5lib.py | 71 |
1 files changed, 24 insertions, 47 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 9897675..0d7a1a9 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -78,7 +78,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def elementClass(self, name, namespace): if namespace is not None: warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) - return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace) + tag = self.soup.new_tag(name) + return Element(tag, self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) @@ -89,10 +90,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return Element(self.soup, self.soup, None) def appendChild(self, node): - self.soup.insert(len(self.soup.contents), node.element) - - def testSerializer(self, element): - return testSerializer(element) + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) def getDocument(self): return self.soup @@ -126,31 +125,17 @@ class Element(html5lib.treebuilders._base.Node): self.soup = soup self.namespace = namespace - def _nodeIndex(self, node, refNode): - # Finds a node by identity rather than equality - for index in range(len(self.element.contents)): - if id(self.element.contents[index]) == id(refNode.element): - return index - return None - def appendChild(self, node): if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # Concatenate new text onto old text node - # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...") - newStr = NavigableString(self.element.contents[-1]+node.element) - - # Remove the old text node - # (Can't simply use .extract() by itself, because it fails if - # an equal text node exists within the parent node) - oldElement = self.element.contents[-1] - del self.element.contents[-1] - oldElement.parent = None - oldElement.extract() - - self.element.insert(len(self.element.contents), newStr) + # XXX This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + node.element) + old_element.replace_with(new_element) else: - self.element.insert(len(self.element.contents), node.element) + self.element.append(node.element) node.parent = self def getAttributes(self): @@ -162,58 +147,50 @@ class Element(html5lib.treebuilders._base.Node): self.element[name] = value # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. - # The Tag constructor calls this method automatically, - # but html5lib creates a Tag object before setting up - # the attributes. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. self.element.contains_substitutions = ( self.soup.builder.set_up_substitutions( self.element)) attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): - text = TextNode(NavigableString(data), self.soup) + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: self.insertBefore(text, insertBefore) else: self.appendChild(text) def insertBefore(self, node, refNode): - index = self._nodeIndex(node, refNode) + index = self.element.index(refNode.element) if (node.element.__class__ == NavigableString and self.element.contents and self.element.contents[index-1].__class__ == NavigableString): # (See comments in appendChild) - newStr = NavigableString(self.element.contents[index-1]+node.element) - oldNode = self.element.contents[index-1] - del self.element.contents[index-1] - oldNode.parent = None - oldNode.extract() - - self.element.insert(index-1, newStr) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) else: self.element.insert(index, node.element) node.parent = self def removeChild(self, node): - index = self._nodeIndex(node.parent, node) - # XXX This if statement is problematic: - # https://bugs.launchpad.net/beautifulsoup/+bug/838800 - if index is not None: - del node.parent.element.contents[index] - node.element.parent = None node.element.extract() - node.parent = None def reparentChildren(self, newParent): while self.element.contents: child = self.element.contents[0] child.extract() if isinstance(child, Tag): - newParent.appendChild(Element(child, self.soup, namespaces["html"])) + newParent.appendChild( + Element(child, self.soup, namespaces["html"])) else: - newParent.appendChild(TextNode(child, self.soup)) + newParent.appendChild( + TextNode(child, self.soup)) def cloneNode(self): - node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace) + tag = self.soup.new_tag(self.element.name) + node = Element(tag, self.soup, self.namespace) for key,value in self.attributes: node.attributes[key] = value return node |