summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 08:00:47 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-02-23 08:00:47 -0500
commit 7e2cea6ebc5ae65f3faa5fc764e7ef51019e2a9d (patch)
tree ccebf04e9acb1820c2c5a3991d9cd01bfc857f85 /scripts
parent 3d63b9ee9672f3ec48a3043bfe59a578e2537308 (diff)
Cleaned up script and added it to the MANIFEST.in.
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/demo_differences.py 55
-rw-r--r-- scripts/demonstrate_parser_differences.py 95
-rw-r--r-- scripts/demonstration_markup.txt (renamed from scripts/differences.txt) 0
3 files changed, 95 insertions, 55 deletions
diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py
deleted file mode 100644
index c544ea1..0000000
--- a/scripts/demo_differences.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from bs4 import BeautifulSoup
-
-different_results = []
-uniform_results = []
-
-class Demonstration(object):
- def __init__(self, markup):
- self.results = {}
- self.markup = markup
-
- def run_against(self, *parser_names):
- uniform_results = True
- previous_output = None
- for parser in parser_names:
- try:
- soup = BeautifulSoup(self.markup, parser)
- if markup.startswith("<div>"):
- # Extract the interesting part
- output = soup.div
- else:
- output = soup
- except Exception, e:
- output = "[EXCEPTION] %s" % str(e)
- self.results[parser] = output
- if previous_output is None:
- previous_output = output
- elif previous_output != output:
- uniform_results = False
- return uniform_results
-
- def dump(self):
- print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
- for parser, output in self.results.items():
- print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
-
-
-for markup in open("differences.txt"):
- demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
- is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
- if is_uniform:
- uniform_results.append(demo)
- else:
- different_results.append(demo)
-
-print "Markup that's handled the same in every parser:"
-for demo in uniform_results:
- demo.dump()
- print "-" * 80
-print
-print "=" * 80
-print
-print "Markup that's not handled the same in every parser:"
-for demo in different_results:
- demo.dump()
- print "-" * 80
diff --git a/scripts/demonstrate_parser_differences.py b/scripts/demonstrate_parser_differences.py
new file mode 100644
index 0000000..d84670a
--- /dev/null
+++ b/scripts/demonstrate_parser_differences.py
@@ -0,0 +1,95 @@
+"""Demonstrate how different parsers parse the same markup.
+
+Beautiful Soup can use any of a number of different parsers. Every
+parser should behave more or less the same on valid markup, and
+Beautiful Soup's unit tests make sure this is the case. But every
+parser handles invalid markup differently. Even different versions of
+the same parser handle invalid markup differently. So instead of unit
+tests I've created this educational demonstration script.
+
+The file demonstration_markup.txt contains many lines of HTML. This
+script tests each line of markup against every parser you have
+installed, and prints out how each parser sees that markup. This may
+help you choose a parser, or understand why Beautiful Soup presents
+your document the way it does.
+"""
+
+import os
+import sys
+from bs4 import BeautifulSoup
+parsers = ['html.parser']
+
+try:
+ from bs4.builder import _lxml
+ parsers.append('lxml')
+except ImportError, e:
+ pass
+
+try:
+ from bs4.builder import _html5lib
+ parsers.append('html5lib')
+except ImportError, e:
+ pass
+
+class Demonstration(object):
+ def __init__(self, markup):
+ self.results = {}
+ self.markup = markup
+
+ def run_against(self, *parser_names):
+ uniform_results = True
+ previous_output = None
+ for parser in parser_names:
+ try:
+ soup = BeautifulSoup(self.markup, parser)
+ if markup.startswith("<div>"):
+ # Extract the interesting part
+ output = soup.div
+ else:
+ output = soup
+ except Exception, e:
+ output = "[EXCEPTION] %s" % str(e)
+ self.results[parser] = output
+ if previous_output is None:
+ previous_output = output
+ elif previous_output != output:
+ uniform_results = False
+ return uniform_results
+
+ def dump(self):
+ print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+ for parser, output in self.results.items():
+ print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+
+different_results = []
+uniform_results = []
+
+print "= Testing the following parsers: %s =" % ", ".join(parsers)
+print
+
+input_file = sys.stdin
+if sys.stdin.isatty():
+ for filename in [
+ "demonstration_markup.txt",
+ os.path.join("scripts", "demonstration_markup.txt")]:
+ if os.path.exists(filename):
+ input_file = open(filename)
+
+for markup in input_file:
+ demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+ is_uniform = demo.run_against(*parsers)
+ if is_uniform:
+ uniform_results.append(demo)
+ else:
+ different_results.append(demo)
+
+print "== Markup that's handled the same in every parser =="
+print
+for demo in uniform_results:
+ demo.dump()
+ print
+print "== Markup that's not handled the same in every parser =="
+print
+for demo in different_results:
+ demo.dump()
+ print
diff --git a/scripts/differences.txt b/scripts/demonstration_markup.txt
index a7914a0..a7914a0 100644
--- a/scripts/differences.txt
+++ b/scripts/demonstration_markup.txt