summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 10:33:03 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 10:33:03 -0400
commit 4cc522e01dae3fab54170060beef1389d528f6b6 (patch)
tree 509368036b44969526becf745093fb7327676e02
parent aff6cac088db63a65415f2d239e9c8bf07001e73 (diff)
Improved performance for html5lib.
-rw-r--r-- bs4/builder/_html5lib.py 36
-rw-r--r-- bs4/diagnose.py 6
-rw-r--r-- bs4/element.py 2
3 files changed, 29 insertions, 15 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 3bbc9a9..79f618a 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -123,17 +123,31 @@ class Element(html5lib.treebuilders._base.Node):
self.namespace = namespace
def appendChild(self, node):
- if (node.element.__class__ == NavigableString and self.element.contents
+ string_child = child = None
+ if isinstance(node, basestring):
+ # Some other piece of code decided to pass in a string
+ # instead of creating a TextElement object to contain the
+ # string.
+ string_child = child = node
+ elif node.element.__class__ == NavigableString:
+ string_child = child = node.element
+ else:
+ child = node.element
+
+ if (string_child and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
- # Concatenate new text onto old text node
- # XXX This has O(n^2) performance, for input like
+ # We are appending a string onto another string.
+ # TODO This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
- new_element = self.soup.new_string(old_element + node.element)
+ new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
- self.soup._most_recent_element = new_element
+ self.soup._most_recent_element = new_element
else:
- self.soup.object_was_parsed(node.element, parent=self.element)
+ if isinstance(node, basestring):
+ # Create a brand new NavigableString from this string.
+ child = self.soup.new_string(node)
+ self.soup.object_was_parsed(child, parent=self.element)
def getAttributes(self):
return AttrList(self.element)
@@ -162,11 +176,11 @@ class Element(html5lib.treebuilders._base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
- text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- self.insertBefore(text, insertBefore)
+ text = TextNode(self.soup.new_string(data), self.soup)
+ self.insertBefore(data, insertBefore)
else:
- self.appendChild(text)
+ self.appendChild(data)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@@ -182,6 +196,7 @@ class Element(html5lib.treebuilders._base.Node):
def removeChild(self, node):
node.element.extract()
+ pass
def reparentChildren(self, newParent):
while self.element.contents:
@@ -191,8 +206,7 @@ class Element(html5lib.treebuilders._base.Node):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
- newParent.appendChild(
- TextNode(child, self.soup))
+ newParent.appendChild(child)
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index ad79d8a..bd995c2 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -196,12 +196,12 @@ def profile(num_elements=100000, parser="lxml"):
cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
stats = pstats.Stats(filename)
- stats.strip_dirs()
+ # stats.strip_dirs()
cumulative = stats.sort_stats("cumulative")
total = stats.sort_stats("time")
- total.print_stats(50)
+ cumulative.print_stats('_html5lib|bs4', 50)
if __name__ == '__main__':
#diagnose(sys.stdin.read())
- profile(1000, parser="html5lib")
+ profile(10000, parser="html5lib")
# benchmark_parsers()
diff --git a/bs4/element.py b/bs4/element.py
index 7b63b30..40b1631 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -258,7 +258,7 @@ class PageElement(object):
def _last_descendant(self):
"Finds the last element beneath this object to be parsed."
last_child = self
- while hasattr(last_child, 'contents') and last_child.contents:
+ while isinstance(last_child, Tag) and last_child.contents:
last_child = last_child.contents[-1]
return last_child
# BS3: Not part of the API!