diff options
-rw-r--r-- | bs4/builder/_html5lib.py | 36 | ||||
-rw-r--r-- | bs4/diagnose.py | 6 | ||||
-rw-r--r-- | bs4/element.py | 2 |
3 files changed, 29 insertions, 15 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 3bbc9a9..79f618a 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -123,17 +123,31 @@ class Element(html5lib.treebuilders._base.Node): self.namespace = namespace def appendChild(self, node): - if (node.element.__class__ == NavigableString and self.element.contents + string_child = child = None + if isinstance(node, basestring): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + else: + child = node.element + + if (string_child and self.element.contents and self.element.contents[-1].__class__ == NavigableString): - # Concatenate new text onto old text node - # XXX This has O(n^2) performance, for input like + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like # "a</a>a</a>a</a>..." old_element = self.element.contents[-1] - new_element = self.soup.new_string(old_element + node.element) + new_element = self.soup.new_string(old_element + string_child) old_element.replace_with(new_element) - self.soup._most_recent_element = new_element + self.soup._most_recent_element = new_element else: - self.soup.object_was_parsed(node.element, parent=self.element) + if isinstance(node, basestring): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + self.soup.object_was_parsed(child, parent=self.element) def getAttributes(self): return AttrList(self.element) @@ -162,11 +176,11 @@ class Element(html5lib.treebuilders._base.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): - text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - self.insertBefore(text, insertBefore) + text = TextNode(self.soup.new_string(data), self.soup) + self.insertBefore(data, insertBefore) else: - self.appendChild(text) + self.appendChild(data) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) @@ -182,6 +196,7 @@ class Element(html5lib.treebuilders._base.Node): def removeChild(self, node): node.element.extract() + pass def reparentChildren(self, newParent): while self.element.contents: @@ -191,8 +206,7 @@ class Element(html5lib.treebuilders._base.Node): newParent.appendChild( Element(child, self.soup, namespaces["html"])) else: - newParent.appendChild( - TextNode(child, self.soup)) + newParent.appendChild(child) def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) diff --git a/bs4/diagnose.py b/bs4/diagnose.py index ad79d8a..bd995c2 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -196,12 +196,12 @@ def profile(num_elements=100000, parser="lxml"): cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) stats = pstats.Stats(filename) - stats.strip_dirs() + # stats.strip_dirs() cumulative = stats.sort_stats("cumulative") total = stats.sort_stats("time") - total.print_stats(50) + cumulative.print_stats('_html5lib|bs4', 50) if __name__ == '__main__': #diagnose(sys.stdin.read()) - profile(1000, parser="html5lib") + profile(10000, parser="html5lib") # benchmark_parsers() diff --git a/bs4/element.py b/bs4/element.py index 7b63b30..40b1631 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -258,7 +258,7 @@ class PageElement(object): def _last_descendant(self): "Finds the last element beneath this object to be parsed." last_child = self - while hasattr(last_child, 'contents') and last_child.contents: + while isinstance(last_child, Tag) and last_child.contents: last_child = last_child.contents[-1] return last_child # BS3: Not part of the API! |