From 3d63b9ee9672f3ec48a3043bfe59a578e2537308 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Wed, 22 Feb 2012 13:19:06 -0500 Subject: Added scripts. --- scripts/demo_differences.py | 55 +++++++++++++++++++++++++++++++++++++++++++++ scripts/differences.txt | 34 ++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 scripts/demo_differences.py create mode 100644 scripts/differences.txt (limited to 'scripts') diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py new file mode 100644 index 0000000..c544ea1 --- /dev/null +++ b/scripts/demo_differences.py @@ -0,0 +1,55 @@ +from bs4 import BeautifulSoup + +different_results = [] +uniform_results = [] + +class Demonstration(object): + def __init__(self, markup): + self.results = {} + self.markup = markup + + def run_against(self, *parser_names): + uniform_results = True + previous_output = None + for parser in parser_names: + try: + soup = BeautifulSoup(self.markup, parser) + if markup.startswith("
"): + # Extract the interesting part + output = soup.div + else: + output = soup + except Exception, e: + output = "[EXCEPTION] %s" % str(e) + self.results[parser] = output + if previous_output is None: + previous_output = output + elif previous_output != output: + uniform_results = False + return uniform_results + + def dump(self): + print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) + for parser, output in self.results.items(): + print "%s: %s" % (parser.rjust(13), output.encode("utf8")) + + +for markup in open("differences.txt"): + demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) + is_uniform = demo.run_against("html.parser", "lxml", "html5lib") + if is_uniform: + uniform_results.append(demo) + else: + different_results.append(demo) + +print "Markup that's handled the same in every parser:" +for demo in uniform_results: + demo.dump() + print "-" * 80 +print +print "=" * 80 +print +print "Markup that's not handled the same in every parser:" +for demo in different_results: + demo.dump() + print "-" * 80 diff --git a/scripts/differences.txt b/scripts/differences.txt new file mode 100644 index 0000000..a7914a0 --- /dev/null +++ b/scripts/differences.txt @@ -0,0 +1,34 @@ +A bare string + + +
+
HTML5 does allow CDATA sections in SVG
+
A tag
+
A
tag that supposedly has contents.
+
AT&T
+
+
+
This numeric entity is missing the final semicolon:
+
, that attribute value was closed by the subsequent tag
+
a
+
This document contains (do you see it?)
+
This document ends with That attribute value was bogus
+The doctype is invalid because it contains extra whitespace +
That boolean attribute had no value
+
Here's a nonexistent entity: &#foo; (do you see it?)
+
This document ends before the entity finishes: > +

Paragraphs shouldn't contain block display elements, but this one does:

you see?

+Multiple values for the same attribute. +
Here's a table
+
+
This tag contains nothing but whitespace:
+

This p tag is cut off by

the end of the blockquote tag
+
Here's a nested table:
foo
This table contains bare markup
+ +
This document contains a surprise doctype
+ +
Tag name contains Unicode characters
+ + -- cgit v1.2.3 From 7e2cea6ebc5ae65f3faa5fc764e7ef51019e2a9d Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Thu, 23 Feb 2012 08:00:47 -0500 Subject: Cleaned up script and added it to the MANIFEST.in. --- scripts/demo_differences.py | 55 ------------------ scripts/demonstrate_parser_differences.py | 95 +++++++++++++++++++++++++++++++ scripts/demonstration_markup.txt | 34 +++++++++++ scripts/differences.txt | 34 ----------- 4 files changed, 129 insertions(+), 89 deletions(-) delete mode 100644 scripts/demo_differences.py create mode 100644 scripts/demonstrate_parser_differences.py create mode 100644 scripts/demonstration_markup.txt delete mode 100644 scripts/differences.txt (limited to 'scripts') diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py deleted file mode 100644 index c544ea1..0000000 --- a/scripts/demo_differences.py +++ /dev/null @@ -1,55 +0,0 @@ -from bs4 import BeautifulSoup - -different_results = [] -uniform_results = [] - -class Demonstration(object): - def __init__(self, markup): - self.results = {} - self.markup = markup - - def run_against(self, *parser_names): - uniform_results = True - previous_output = None - for parser in parser_names: - try: - soup = BeautifulSoup(self.markup, parser) - if markup.startswith("
"): - # Extract the interesting part - output = soup.div - else: - output = soup - except Exception, e: - output = "[EXCEPTION] %s" % str(e) - self.results[parser] = output - if previous_output is None: - previous_output = output - elif previous_output != output: - uniform_results = False - return uniform_results - - def dump(self): - print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) - for parser, output in self.results.items(): - print "%s: %s" % (parser.rjust(13), output.encode("utf8")) - - -for markup in open("differences.txt"): - demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) - is_uniform = demo.run_against("html.parser", "lxml", "html5lib") - if is_uniform: - uniform_results.append(demo) - else: - different_results.append(demo) - -print "Markup that's handled the same in every parser:" -for demo in uniform_results: - demo.dump() - print "-" * 80 -print -print "=" * 80 -print -print "Markup that's not handled the same in every parser:" -for demo in different_results: - demo.dump() - print "-" * 80 diff --git a/scripts/demonstrate_parser_differences.py b/scripts/demonstrate_parser_differences.py new file mode 100644 index 0000000..d84670a --- /dev/null +++ b/scripts/demonstrate_parser_differences.py @@ -0,0 +1,95 @@ +"""Demonstrate how different parsers parse the same markup. + +Beautiful Soup can use any of a number of different parsers. Every +parser should behave more or less the same on valid markup, and +Beautiful Soup's unit tests make sure this is the case. But every +parser handles invalid markup differently. Even different versions of +the same parser handle invalid markup differently. So instead of unit +tests I've created this educational demonstration script. + +The file demonstration_markup.txt contains many lines of HTML. This +script tests each line of markup against every parser you have +installed, and prints out how each parser sees that markup. This may +help you choose a parser, or understand why Beautiful Soup presents +your document the way it does. +""" + +import os +import sys +from bs4 import BeautifulSoup +parsers = ['html.parser'] + +try: + from bs4.builder import _lxml + parsers.append('lxml') +except ImportError, e: + pass + +try: + from bs4.builder import _html5lib + parsers.append('html5lib') +except ImportError, e: + pass + +class Demonstration(object): + def __init__(self, markup): + self.results = {} + self.markup = markup + + def run_against(self, *parser_names): + uniform_results = True + previous_output = None + for parser in parser_names: + try: + soup = BeautifulSoup(self.markup, parser) + if markup.startswith("
"): + # Extract the interesting part + output = soup.div + else: + output = soup + except Exception, e: + output = "[EXCEPTION] %s" % str(e) + self.results[parser] = output + if previous_output is None: + previous_output = output + elif previous_output != output: + uniform_results = False + return uniform_results + + def dump(self): + print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) + for parser, output in self.results.items(): + print "%s: %s" % (parser.rjust(13), output.encode("utf8")) + +different_results = [] +uniform_results = [] + +print "= Testing the following parsers: %s =" % ", ".join(parsers) +print + +input_file = sys.stdin +if sys.stdin.isatty(): + for filename in [ + "demonstration_markup.txt", + os.path.join("scripts", "demonstration_markup.txt")]: + if os.path.exists(filename): + input_file = open(filename) + +for markup in input_file: + demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) + is_uniform = demo.run_against(*parsers) + if is_uniform: + uniform_results.append(demo) + else: + different_results.append(demo) + +print "== Markup that's handled the same in every parser ==" +print +for demo in uniform_results: + demo.dump() + print +print "== Markup that's not handled the same in every parser ==" +print +for demo in different_results: + demo.dump() + print diff --git a/scripts/demonstration_markup.txt b/scripts/demonstration_markup.txt new file mode 100644 index 0000000..a7914a0 --- /dev/null +++ b/scripts/demonstration_markup.txt @@ -0,0 +1,34 @@ +A bare string + + +
+
HTML5 does allow CDATA sections in SVG
+
A tag
+
A
tag that supposedly has contents.
+
AT&T
+
+
+
This numeric entity is missing the final semicolon:
+ +
a
+
This document contains (do you see it?)
+
This document ends with That attribute value was bogus
+The doctype is invalid because it contains extra whitespace +
That boolean attribute had no value
+
Here's a nonexistent entity: &#foo; (do you see it?)
+
This document ends before the entity finishes: > +

Paragraphs shouldn't contain block display elements, but this one does:

you see?

+Multiple values for the same attribute. +
Here's a table
+
+
This tag contains nothing but whitespace:
+

This p tag is cut off by

the end of the blockquote tag
+
Here's a nested table:
foo
This table contains bare markup
+ +
This document contains a surprise doctype
+ +
Tag name contains Unicode characters
+ + diff --git a/scripts/differences.txt b/scripts/differences.txt deleted file mode 100644 index a7914a0..0000000 --- a/scripts/differences.txt +++ /dev/null @@ -1,34 +0,0 @@ -A bare string - - -
-
HTML5 does allow CDATA sections in SVG
-
A tag
-
A
tag that supposedly has contents.
-
AT&T
-
-
-
This numeric entity is missing the final semicolon:
- -
a
-
This document contains (do you see it?)
-
This document ends with That attribute value was bogus
-The doctype is invalid because it contains extra whitespace -
That boolean attribute had no value
-
Here's a nonexistent entity: &#foo; (do you see it?)
-
This document ends before the entity finishes: > -

Paragraphs shouldn't contain block display elements, but this one does:

you see?

-Multiple values for the same attribute. -
Here's a table
-
-
This tag contains nothing but whitespace:
-

This p tag is cut off by

the end of the blockquote tag
-
Here's a nested table:
foo
This table contains bare markup
- -
This document contains a surprise doctype
- -
Tag name contains Unicode characters
- - -- cgit v1.2.3