From 3d63b9ee9672f3ec48a3043bfe59a578e2537308 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Wed, 22 Feb 2012 13:19:06 -0500
Subject: Added scripts.

---
 scripts/demo_differences.py | 55 +++++++++++++++++++++++++++++++++++++++++++++
 scripts/differences.txt     | 34 ++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 scripts/demo_differences.py
 create mode 100644 scripts/differences.txt

(limited to 'scripts')
diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py
new file mode 100644
index 0000000..c544ea1
--- /dev/null
+++ b/scripts/demo_differences.py
@@ -0,0 +1,55 @@
+from bs4 import BeautifulSoup
+
+different_results = []
+uniform_results = []
+
+class Demonstration(object):
+    def __init__(self, markup):
+        self.results = {}
+        self.markup = markup
+
+    def run_against(self, *parser_names):
+        uniform_results = True
+        previous_output = None
+        for parser in parser_names:
+            try:
+                soup = BeautifulSoup(self.markup, parser)
+                if markup.startswith("<div>"):
+                    # Extract the interesting part
+                    output = soup.div
+                else:
+                    output = soup
+            except Exception, e:
+                output = "[EXCEPTION] %s" % str(e)
+            self.results[parser] = output
+            if previous_output is None:
+                previous_output = output
+            elif previous_output != output:
+                uniform_results = False
+        return uniform_results
+
+    def dump(self):
+        print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+        for parser, output in self.results.items():
+            print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+
+
+for markup in open("differences.txt"):
+    demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+    is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
+    if is_uniform:
+        uniform_results.append(demo)
+    else:
+        different_results.append(demo)
+
+print "Markup that's handled the same in every parser:"
+for demo in uniform_results:
+    demo.dump()
+    print "-" * 80
+print
+print "=" * 80
+print
+print "Markup that's not handled the same in every parser:"
+for demo in different_results:
+    demo.dump()
+    print "-" * 80
diff --git a/scripts/differences.txt b/scripts/differences.txt
new file mode 100644
index 0000000..a7914a0
--- /dev/null
+++ b/scripts/differences.txt
@@ -0,0 +1,34 @@
+A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b>    </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our☃>Tag name contains Unicode characters</our☃></div>
+<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-- 
cgit v1.2.3


From 7e2cea6ebc5ae65f3faa5fc764e7ef51019e2a9d Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonard.richardson@canonical.com>
Date: Thu, 23 Feb 2012 08:00:47 -0500
Subject: Cleaned up script and added it to the MANIFEST.in.

---
 scripts/demo_differences.py               | 55 ------------------
 scripts/demonstrate_parser_differences.py | 95 +++++++++++++++++++++++++++++++
 scripts/demonstration_markup.txt          | 34 +++++++++++
 scripts/differences.txt                   | 34 -----------
 4 files changed, 129 insertions(+), 89 deletions(-)
 delete mode 100644 scripts/demo_differences.py
 create mode 100644 scripts/demonstrate_parser_differences.py
 create mode 100644 scripts/demonstration_markup.txt
 delete mode 100644 scripts/differences.txt

(limited to 'scripts')

diff --git a/scripts/demo_differences.py b/scripts/demo_differences.py
deleted file mode 100644
index c544ea1..0000000
--- a/scripts/demo_differences.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from bs4 import BeautifulSoup
-
-different_results = []
-uniform_results = []
-
-class Demonstration(object):
-    def __init__(self, markup):
-        self.results = {}
-        self.markup = markup
-
-    def run_against(self, *parser_names):
-        uniform_results = True
-        previous_output = None
-        for parser in parser_names:
-            try:
-                soup = BeautifulSoup(self.markup, parser)
-                if markup.startswith("<div>"):
-                    # Extract the interesting part
-                    output = soup.div
-                else:
-                    output = soup
-            except Exception, e:
-                output = "[EXCEPTION] %s" % str(e)
-            self.results[parser] = output
-            if previous_output is None:
-                previous_output = output
-            elif previous_output != output:
-                uniform_results = False
-        return uniform_results
-
-    def dump(self):
-        print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
-        for parser, output in self.results.items():
-            print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
-
-
-for markup in open("differences.txt"):
-    demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
-    is_uniform = demo.run_against("html.parser", "lxml", "html5lib")
-    if is_uniform:
-        uniform_results.append(demo)
-    else:
-        different_results.append(demo)
-
-print "Markup that's handled the same in every parser:"
-for demo in uniform_results:
-    demo.dump()
-    print "-" * 80
-print
-print "=" * 80
-print
-print "Markup that's not handled the same in every parser:"
-for demo in different_results:
-    demo.dump()
-    print "-" * 80
diff --git a/scripts/demonstrate_parser_differences.py b/scripts/demonstrate_parser_differences.py
new file mode 100644
index 0000000..d84670a
--- /dev/null
+++ b/scripts/demonstrate_parser_differences.py
@@ -0,0 +1,95 @@
+"""Demonstrate how different parsers parse the same markup.
+
+Beautiful Soup can use any of a number of different parsers. Every
+parser should behave more or less the same on valid markup, and
+Beautiful Soup's unit tests make sure this is the case. But every
+parser handles invalid markup differently. Even different versions of
+the same parser handle invalid markup differently. So instead of unit
+tests I've created this educational demonstration script.
+
+The file demonstration_markup.txt contains many lines of HTML. This
+script tests each line of markup against every parser you have
+installed, and prints out how each parser sees that markup. This may
+help you choose a parser, or understand why Beautiful Soup presents
+your document the way it does.
+"""
+
+import os
+import sys
+from bs4 import BeautifulSoup
+parsers = ['html.parser']
+
+try:
+    from bs4.builder import _lxml
+    parsers.append('lxml')
+except ImportError:
+    pass  # builder didn't import (presumably lxml isn't installed); skip it
+
+try:
+    from bs4.builder import _html5lib
+    parsers.append('html5lib')
+except ImportError:
+    pass  # builder didn't import (presumably html5lib isn't installed); skip it
+
+class Demonstration(object):
+    """Holds one piece of markup and what each parser made of it."""
+    def __init__(self, markup):
+        self.results = {}  # parser name -> parse result or exception text
+        self.markup = markup
+
+    def run_against(self, *parser_names):
+        """Parse the markup with each parser; True if no result differs."""
+        uniform_results = True
+        previous_output = None
+        for parser in parser_names:
+            try:
+                soup = BeautifulSoup(self.markup, parser)
+                # Check our own markup, not a global; keep just the <div>.
+                output = soup.div if self.markup.startswith("<div>") else soup
+            except Exception, e:
+                output = "[EXCEPTION] %s" % str(e)
+            self.results[parser] = output
+            if previous_output is None:
+                previous_output = output
+            elif previous_output != output:
+                uniform_results = False
+        return uniform_results
+
+    def dump(self):
+        """Print the markup, then each parser's result, encoded as UTF-8."""
+        print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+        for parser, output in self.results.items():
+            print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+
+different_results = []
+uniform_results = []
+
+print "= Testing the following parsers: %s =" % ", ".join(parsers)
+print
+
+# Read markup piped in on stdin; at a terminal, look for the markup file.
+input_file = sys.stdin
+if sys.stdin.isatty():
+    for filename in ("demonstration_markup.txt",
+                     os.path.join("scripts", "demonstration_markup.txt")):
+        if os.path.exists(filename):
+            input_file = open(filename)
+            break  # first match wins; never open and abandon a second handle
+
+for markup in input_file:  # one document per line; literal \n marks a newline
+    demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+    is_uniform = demo.run_against(*parsers)
+    (uniform_results if is_uniform else different_results).append(demo)
+if input_file is not sys.stdin:
+    input_file.close()
+
+print "== Markup that's handled the same in every parser =="
+print
+for demo in uniform_results:
+    demo.dump()
+    print
+print "== Markup that's not handled the same in every parser =="
+print
+for demo in different_results:
+    demo.dump()
+    print
diff --git a/scripts/demonstration_markup.txt b/scripts/demonstration_markup.txt
new file mode 100644
index 0000000..a7914a0
--- /dev/null
+++ b/scripts/demonstration_markup.txt
@@ -0,0 +1,34 @@
+A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b>    </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our☃>Tag name contains Unicode characters</our☃></div>
+<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
diff --git a/scripts/differences.txt b/scripts/differences.txt
deleted file mode 100644
index a7914a0..0000000
--- a/scripts/differences.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-A bare string
-<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
-<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
-<div><![CDATA[A CDATA section where it doesn't belong]]></div>
-<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
-<div>A <meta> tag</div>
-<div>A <br> tag that supposedly has contents.</br></div>
-<div>AT&T</div>
-<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
-<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
-<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
-<div><a href="http://example.com/</a> that attribute value never got closed</div>
-<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
-<! This document starts with a bogus declaration ><div>a</div>
-<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
-<div>This document ends with <!an incomplete declaration
-<div><a style={height:21px;}>That attribute value was bogus</a></div>
-<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
-<div><table><td nowrap>That boolean attribute had no value</td></table></div>
-<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
-<div>This document ends before the entity finishes: &gt
-<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
-<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
-<div><table><tr><td>Here's a table</td></tr></table></div>
-<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
-<div>This tag contains nothing but whitespace: <b>    </b></div>
-<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
-<div><table><div>This table contains bare markup</div></table></div>
-<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
-<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
-<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
-<div><our☃>Tag name contains Unicode characters</our☃></div>
-<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-- 
cgit v1.2.3