Added raw html5lib to the list of parsers that get tested.

author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:54:49 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:54:49 -0400
commit: aff6cac088db63a65415f2d239e9c8bf07001e73 (patch)
tree: 24d832c97f1cdd835d2ab1ed81ba83e94d360623
parent: 74380c3685d0fe730bcb06e63d8591e65b557df5 (diff)
2 files changed, 12 insertions, 1 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 7b801c9..dcfb733 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -19,6 +19,9 @@
 * Beautiful Soup will issue a warning if instead of markup you pass it
   a URL or the name of a file on disk (a common beginner mistake).
 
+* A number of optimizations improve the performance of the lxml tree
+  builder by about 33%, and the html.parser tree builder by about 20%.
+
 = 4.2.1 (20130531) =
 
 * The default XML formatter will now replace ampersands even if they
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index a2b405b..ad79d8a 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -179,6 +179,13 @@ def benchmark_parsers(num_elements=100000):
     b = time.time()
     print "Raw lxml parsed the markup in %.2fs." % (b-a)
 
+    import html5lib
+    parser = html5lib.HTMLParser()
+    a = time.time()
+    parser.parse(data)
+    b = time.time()
+    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
 def profile(num_elements=100000, parser="lxml"):
 
     filehandle = tempfile.NamedTemporaryFile()
@@ -196,4 +203,5 @@ def profile(num_elements=100000, parser="lxml"):
 
 if __name__ == '__main__':
     #diagnose(sys.stdin.read())
-    profile(parser="lxml")
+    profile(1000, parser="html5lib")
+    # benchmark_parsers()
author	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-03 09:54:49 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2013-06-03 09:54:49 -0400
commit	aff6cac088db63a65415f2d239e9c8bf07001e73 (patch)
tree	24d832c97f1cdd835d2ab1ed81ba83e94d360623
parent	74380c3685d0fe730bcb06e63d8591e65b557df5 (diff)