# Provenance: recovered from a cgit-rendered patch (cgit v1.2.3) of commit
# 3d63b9ee9672f3ec48a3043bfe59a578e2537308, "Added scripts." by
# Leonard Richardson, Wed, 22 Feb 2012 13:19:06 -0500, which created
# scripts/demo_differences.py (1 file changed, 55 insertions).
"""Show which markup samples the supported parsers handle differently.

Reads ``differences.txt`` from the current directory: one UTF-8 markup
sample per line, where the two-character sequence backslash-n stands for
a real newline.  Every sample is parsed with each parser, and the samples
are then printed in two groups: those on which all parsers agree and
those on which they do not.

The syntax used here is valid on both Python 2.6+ and Python 3.
"""
import sys

from bs4 import BeautifulSoup

different_results = []
uniform_results = []

# Marks "no parser has produced output yet".  None cannot be used for this
# because ``soup.div`` is None when a parser drops the <div>.
_NOT_RUN = object()


def _write_field(label, value):
    """Write ``label: value`` to stdout as UTF-8 bytes.

    The label is right-justified to 13 columns.  ``value`` may be a string,
    any object with an ``encode(encoding)`` method returning bytes (the
    parse trees stored by ``Demonstration.run_against`` are used this way),
    or None.  Bytes are written directly so the output does not depend on
    the console encoding.
    """
    if value is None:
        encoded = b"[no <div> element]"
    elif isinstance(value, bytes):
        # Already encoded (e.g. a Python 2 exception message); pass through.
        encoded = value
    else:
        encoded = value.encode("utf8")
    line = label.rjust(13).encode("utf8") + b": " + encoded + b"\n"
    # Flush pending text first so byte writes stay in order with print().
    sys.stdout.flush()
    # Python 3 exposes the byte stream as .buffer; Python 2 stdout takes bytes.
    getattr(sys.stdout, "buffer", sys.stdout).write(line)


class Demonstration(object):
    """One markup sample together with each parser's output for it."""

    def __init__(self, markup):
        """Store ``markup`` (text) and start with no parser results."""
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with every named parser and record the output.

        Each output is stored in ``self.results`` under the parser name.  If
        constructing the soup raises, the stored output is the string
        ``"[EXCEPTION] <message>"`` instead.

        Returns True when every output compares equal to the first one,
        False otherwise.
        """
        all_match = True
        first_output = _NOT_RUN
        # NOTE(review): the literal below was garbled in the recovered patch;
        # "<div>" is reconstructed from the ``soup.div`` extraction that
        # follows it -- confirm against the upstream repository.
        wrapped_in_div = self.markup.startswith("<div>")
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                # Extract the interesting part
                output = soup.div if wrapped_in_div else soup
            except Exception as e:
                # Deliberately broad: a failing parser is a result worth
                # showing, not a reason to abort the whole comparison.
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            if first_output is _NOT_RUN:
                first_output = output
            elif first_output != output:
                all_match = False
        return all_match

    def dump(self):
        """Print the markup followed by one line per recorded parser output."""
        _write_field("Markup", self.markup)
        for parser, output in self.results.items():
            _write_field(parser, output)


# Binary mode plus an explicit decode behaves the same on Python 2 and 3.
with open("differences.txt", "rb") as samples:
    for raw_line in samples:
        demo = Demonstration(
            raw_line.decode("utf8").strip().replace("\\n", "\n"))
        is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)

print("Markup that's handled the same in every parser:")
for demo in uniform_results:
    demo.dump()
    print("-" * 80)
print("")
print("=" * 80)
print("")
print("Markup that's not handled the same in every parser:")
for demo in different_results:
    demo.dump()
    print("-" * 80)