# blob: c544ea1e2c5165f70e24dab905e862983ae8ceff (plain)
from bs4 import BeautifulSoup
# Module-level accumulators for Demonstration objects: one list for markup
# on which the parsers disagreed, one for markup they all handled the same.
different_results, uniform_results = [], []
class Demonstration(object):
    """Feed one piece of markup to several parsers and record what each makes of it.

    Instance fields:
      markup  -- the markup string handed to every parser.
      results -- dict mapping a parser name to that parser's output: the
                 parsed document, its ``div`` element, or an
                 "[EXCEPTION] ..." string if parsing raised.
    """

    def __init__(self, markup):
        """Store `markup` and start with an empty results mapping."""
        self.results = {}
        self.markup = markup

    def run_against(self, *parser_names):
        """Parse the markup with each named parser and compare the outputs.

        Every parser's output is stored in ``self.results`` under the
        parser's name. An exception raised while parsing is not propagated;
        it is recorded as the string "[EXCEPTION] <message>".

        Returns True when every output compared equal to the first parser's
        output (including when no parser names are given), False otherwise.
        """
        is_uniform = True
        first_output = None
        for parser in parser_names:
            try:
                soup = BeautifulSoup(self.markup, parser)
                # Test this instance's own markup. The previous code read a
                # module-level name `markup`, which only existed when the
                # class was driven from a loop using that variable name.
                if self.markup.startswith("<div>"):
                    # Extract the interesting part
                    output = soup.div
                else:
                    output = soup
            except Exception as e:
                # Deliberately broad: a parser failure is itself a result
                # worth reporting next to the other parsers' output.
                output = "[EXCEPTION] %s" % str(e)
            self.results[parser] = output
            if first_output is None:
                first_output = output
            elif first_output != output:
                is_uniform = False
        return is_uniform

    def dump(self):
        """Print the markup, then one line per parser with its recorded output.

        Labels are right-justified to 13 characters and the values are
        UTF-8 encoded before printing.
        """
        # Single-argument print(...) is valid as both a Python 2 print
        # statement and a Python 3 function call.
        print("%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")))
        for parser, output in self.results.items():
            print("%s: %s" % (parser.rjust(13), output.encode("utf8")))
# Read one markup sample per line from differences.txt, run each through the
# three parsers, and sort the demonstrations by whether the parsers agreed.
# The file is opened in binary mode so each line is a byte string that can be
# decoded explicitly, and inside `with` so the handle is always closed.
with open("differences.txt", "rb") as differences_file:
    for markup in differences_file:
        # A literal backslash-n in the file stands for a real newline in the
        # markup, since each sample has to fit on a single line.
        demo = Demonstration(
            markup.decode("utf8").strip().replace("\\n", "\n"))
        is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
        if is_uniform:
            uniform_results.append(demo)
        else:
            different_results.append(demo)

# Report: first the samples every parser handled identically, then the rest.
# print(...) with one argument behaves the same on Python 2 and Python 3;
# print("") emits the blank line a bare `print` statement used to.
print("Markup that's handled the same in every parser:")
for demo in uniform_results:
    demo.dump()
    print("-" * 80)
print("")
print("=" * 80)
print("")
print("Markup that's not handled the same in every parser:")
for demo in different_results:
    demo.dump()
    print("-" * 80)
|