diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:54:49 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:54:49 -0400 |
commit | aff6cac088db63a65415f2d239e9c8bf07001e73 (patch) | |
tree | 24d832c97f1cdd835d2ab1ed81ba83e94d360623 | |
parent | 74380c3685d0fe730bcb06e63d8591e65b557df5 (diff) |
Added raw html5lib to the list of parsers that get tested.
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/diagnose.py | 10 |
2 files changed, 12 insertions, 1 deletion
```diff
@@ -19,6 +19,9 @@
 * Beautiful Soup will issue a warning if instead of markup you pass it
   a URL or the name of a file on disk (a common beginner mistake).
 
+* A number of optimizations improve the performance of the lxml tree
+  builder by about 33%, and the html.parser tree builder by about 20%.
+
 = 4.2.1 (20130531) =
 
 * The default XML formatter will now replace ampersands even if they
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index a2b405b..ad79d8a 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -179,6 +179,13 @@ def benchmark_parsers(num_elements=100000):
     b = time.time()
     print "Raw lxml parsed the markup in %.2fs." % (b-a)
 
+    import html5lib
+    parser = html5lib.HTMLParser()
+    a = time.time()
+    parser.parse(data)
+    b = time.time()
+    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
 def profile(num_elements=100000, parser="lxml"):
 
     filehandle = tempfile.NamedTemporaryFile()
@@ -196,4 +203,5 @@ def profile(num_elements=100000, parser="lxml"):
 
 if __name__ == '__main__':
     #diagnose(sys.stdin.read())
-    profile(parser="lxml")
+    profile(1000, parser="html5lib")
+    # benchmark_parsers()
```