from bs4 import BeautifulSoup
# Demonstrations for which every parser produced the same result.
uniform_results = []
# Demonstrations for which at least one parser produced a different result.
different_results = []
class Demonstration(object):
    """Parse one piece of markup with several parsers and compare the results.

    ``markup`` holds the markup string given to every parser. ``results``
    maps each parser name to what that parser produced, or to an
    ``"[EXCEPTION] ..."`` string when building the soup raised.
    """

    def __init__(self, markup):
        """Store *markup* and start with an empty ``results`` mapping."""
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with each named parser and record every output.

        Each output is stored in ``self.results`` under the parser's name.
        An exception raised while parsing is recorded as an
        ``"[EXCEPTION] <message>"`` string instead of propagating.

        Returns True when all recorded outputs compare equal to the first
        one (trivially True for zero or one parser), False otherwise.
        """
        outputs = []
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                # NOTE(review): the prefix literal was garbled in the source;
                # "<div>" is reconstructed from the ``soup.div`` extraction
                # below -- confirm against the upstream script.
                if self.markup.startswith("<div>"):
                    # Extract the interesting part
                    output = soup.div
                else:
                    output = soup
            except Exception as e:
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            outputs.append(output)
        # Compare against the first output explicitly rather than using None
        # as a "nothing seen yet" marker: ``soup.div`` can itself be None.
        return not any(outputs[0] != later for later in outputs[1:])

    def dump(self):
        """Print the markup, then one line per parser with its output."""
        print("%s: %s" % ("Markup".rjust(13), self._printable(self.markup)))
        for parser, output in self.results.items():
            print("%s: %s" % (parser.rjust(13), self._printable(output)))

    @staticmethod
    def _printable(value):
        """Return *value* rendered as the interpreter's native ``str``."""
        encoded = value.encode("utf8")
        if isinstance(encoded, str):
            # Python 2: the encoded byte string is already what print wants.
            return encoded
        # Python 3: decode again so print shows text, not a b'...' repr.
        return encoded.decode("utf8")
# Run each line of differences.txt through all three parsers and sort the
# resulting demonstrations by whether the parsers agreed.
# The file is read as bytes and decoded explicitly so the input is always
# treated as UTF-8, and the ``with`` block closes it when the loop ends.
with open("differences.txt", "rb") as differences:
    for markup in differences:
        # A literal backslash-n in the file stands for a real newline.
        demo = Demonstration(
            markup.decode("utf8").strip().replace("\\n", "\n")
        )
        is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)

print("Markup that's handled the same in every parser:")
for demo in uniform_results:
    demo.dump()
    print("-" * 80)
# print("") rather than print(): the latter would output "()" if this file
# is run by an interpreter that still treats print as a statement.
print("")
print("=" * 80)
print("")
print("Markup that's not handled the same in every parser:")
for demo in different_results:
    demo.dump()
    print("-" * 80)