summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2023-04-05 09:48:16 -0400
committer Leonard Richardson <leonardr@segfault.org> 2023-04-05 09:48:16 -0400
commit 43038bb57e1379795db4363d9715fb238fb67f50 (patch)
tree 7940d405ab261e085cf61cbc2a04436c4fc6f10c
parent c8ec7d1e72c9aaa840cde48d3204fdfa1e872232 (diff)
The demonstrate_parser_differences.py script was still written in
Python 2. I've converted it to Python 3, but since no one noticed this problem, it's a sign that no one uses this script and it's not serving its purpose. I may rework or remove it in a later version.
-rw-r--r-- CHANGELOG 9
-rw-r--r-- pyproject.toml 1
-rw-r--r-- scripts/demonstrate_parser_differences.py 75
-rw-r--r-- scripts/demonstration_markup.txt 34
4 files changed, 62 insertions, 57 deletions
diff --git a/CHANGELOG b/CHANGELOG
index cba16ee..5baac68 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -19,7 +19,9 @@ supported.
object can still cause an overflow.)
* Replaced setup.py and setup.cfg with pyproject.toml. Beautiful Soup
- packages now uses hatch as its build backend.
+ packages now uses hatch as its build backend. This results in some
+ minor changes to the build artifacts, e.g. wheels no longer include
+ the unit tests.
* Added a tox.ini file to make it easier to run the test suite against all
supported versions of Python.
@@ -41,6 +43,11 @@ supported.
* PageElement now implements the known_xml attribute. (This was technically
a bug, but it shouldn't be an issue in normal use.) [bug=2007895]
+* The demonstrate_parser_differences.py script was still written in
+ Python 2. I've converted it to Python 3, but since no one noticed this
+ problem, it's a sign that no one uses this script and it's not
+ serving its purpose. I may rework or remove it in a later version.
+
= 4.12.0 (20230320)
* Introduced the .css property, which centralizes all access to
diff --git a/pyproject.toml b/pyproject.toml
index 8f965d1..7bd0740 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,6 @@ include = [
# Scripts.
"/test-all-versions",
"/scripts/*.py",
- "/scripts/*.txt",
# Documentation source in various languages.
"/doc*/Makefile",
diff --git a/scripts/demonstrate_parser_differences.py b/scripts/demonstrate_parser_differences.py
index d84670a..74c3cad 100644
--- a/scripts/demonstrate_parser_differences.py
+++ b/scripts/demonstrate_parser_differences.py
@@ -7,13 +7,49 @@ parser handles invalid markup differently. Even different versions of
the same parser handle invalid markup differently. So instead of unit
tests I've created this educational demonstration script.
-The file demonstration_markup.txt contains many lines of HTML. This
+The DEMO_MARKUP variable below contains many lines of HTML. This
script tests each line of markup against every parser you have
installed, and prints out how each parser sees that markup. This may
help you choose a parser, or understand why Beautiful Soup presents
your document the way it does.
"""
+DEMO_MARKUP = """A bare string
+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
+<div>A <meta> tag</div>
+<div>A <br> tag that supposedly has contents.</br></div>
+<div>AT&T</div>
+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
+<div><a href="http://example.com/</a> that attribute value never got closed</div>
+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
+<! This document starts with a bogus declaration ><div>a</div>
+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
+<div>This document ends with <!an incomplete declaration
+<div><a style={height:21px;}>That attribute value was bogus</a></div>
+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
+<div>This document ends before the entity finishes: &gt
+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
+<div><table><tr><td>Here's a table</td></tr></table></div>
+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
+<div>This tag contains nothing but whitespace: <b> </b></div>
+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
+<div><table><div>This table contains bare markup</div></table></div>
+<div><div id="1">\\n <a href="link1">This link is never closed.\\n</div>\\n<div id="2">\\n <div id="3">\\n <a href="link2">This link is closed.</a>\\n </div>\\n</div></div>
+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
+<div><our☃>Tag name contains Unicode characters</our☃></div>
+<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">"""
+
+from io import StringIO
import os
import sys
from bs4 import BeautifulSoup
@@ -22,13 +58,13 @@ parsers = ['html.parser']
try:
from bs4.builder import _lxml
parsers.append('lxml')
-except ImportError, e:
+except ImportError as e:
pass
try:
from bs4.builder import _html5lib
parsers.append('html5lib')
-except ImportError, e:
+except ImportError as e:
pass
class Demonstration(object):
@@ -47,7 +83,7 @@ class Demonstration(object):
output = soup.div
else:
output = soup
- except Exception, e:
+ except Exception as e:
output = "[EXCEPTION] %s" % str(e)
self.results[parser] = output
if previous_output is None:
@@ -57,39 +93,36 @@ class Demonstration(object):
return uniform_results
def dump(self):
- print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8"))
+ print("%s: %s" % ("Markup".rjust(13), self.markup))
for parser, output in self.results.items():
- print "%s: %s" % (parser.rjust(13), output.encode("utf8"))
+ print("%s: %s" % (parser.rjust(13), output))
different_results = []
uniform_results = []
-print "= Testing the following parsers: %s =" % ", ".join(parsers)
-print
+print("= Testing the following parsers: %s =" % ", ".join(parsers))
+print()
input_file = sys.stdin
if sys.stdin.isatty():
- for filename in [
- "demonstration_markup.txt",
- os.path.join("scripts", "demonstration_markup.txt")]:
- if os.path.exists(filename):
- input_file = open(filename)
+ input_file = StringIO(DEMO_MARKUP)
-for markup in input_file:
- demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n"))
+for markup_line in input_file.readlines():
+ markup = markup_line.strip().replace("\\n", "\n")
+ demo = Demonstration(markup)
is_uniform = demo.run_against(*parsers)
if is_uniform:
uniform_results.append(demo)
else:
different_results.append(demo)
-print "== Markup that's handled the same in every parser =="
-print
+print("== Markup that's handled the same in every parser ==")
+print()
for demo in uniform_results:
demo.dump()
- print
-print "== Markup that's not handled the same in every parser =="
-print
+ print()
+print("== Markup that's not handled the same in every parser ==")
+print()
for demo in different_results:
demo.dump()
- print
+ print()
diff --git a/scripts/demonstration_markup.txt b/scripts/demonstration_markup.txt
deleted file mode 100644
index a7914a0..0000000
--- a/scripts/demonstration_markup.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-A bare string
-<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
-<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
-<div><![CDATA[A CDATA section where it doesn't belong]]></div>
-<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
-<div>A <meta> tag</div>
-<div>A <br> tag that supposedly has contents.</br></div>
-<div>AT&T</div>
-<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
-<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
-<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
-<div><a href="http://example.com/</a> that attribute value never got closed</div>
-<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
-<! This document starts with a bogus declaration ><div>a</div>
-<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
-<div>This document ends with <!an incomplete declaration
-<div><a style={height:21px;}>That attribute value was bogus</a></div>
-<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
-<div><table><td nowrap>That boolean attribute had no value</td></table></div>
-<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
-<div>This document ends before the entity finishes: &gt
-<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
-<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
-<div><table><tr><td>Here's a table</td></tr></table></div>
-<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
-<div>This tag contains nothing but whitespace: <b> </b></div>
-<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
-<div><table><div>This table contains bare markup</div></table></div>
-<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
-<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
-<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
-<div><our☃>Tag name contains Unicode characters</our☃></div>
-<div><a ☃="snowman">Attribute name contains Unicode characters</a></div>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">