summary refs log tree commit diff
path: root/bs4/tests
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/tests')
-rw-r--r-- bs4/tests/test_html5lib.py 36
-rw-r--r-- bs4/tests/test_htmlparser.py 32
-rw-r--r-- bs4/tests/test_lxml.py 2
-rw-r--r-- bs4/tests/test_soup.py 45
4 files changed, 114 insertions, 1 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..2adebc8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
# because there's no way of knowing, when a string is created,
# where in the tree it will eventually end up.
pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in the
+        # HTML5 spec to a sequence of Unicode characters, and convert
+        # those characters to a (potentially different) named entity on
+        # the way out. This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            # By default the entity comes out as UTF-8 encoded text.
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            # With formatter="html" a named entity is expected instead.
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
attrs[key].append(value)
assert_attribute(accumulate, ["url1", "url2", "url3"])
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in the
+        # HTML5 spec to a sequence of Unicode characters, and convert those
+        # characters to a (potentially different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            # By default the entity comes out as UTF-8 encoded text.
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+            # With formatter="html" a named entity is expected instead.
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index f96e4ae..d8dada4 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
-
+
def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddb6446..9074bdb 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -368,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
+    def test_html5_entity(self):
+        # Some HTML5 entities correspond to single- or multi-character
+        # Unicode sequences.
+
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ('&models;', u'\u22a7'),
+            ('&Nfr;', u'\U0001d511'),
+            ('&ngeqq;', u'\u2267\u0338'),
+            ('&not;', u'\xac'),
+            ('&Not;', u'\u2aec'),
+
+            # We _could_ convert | to &verbar;, but we don't, because
+            # | is an ASCII character.
+            ('|', '|'),
+
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ('&gt;', '>'),
+            ('&lt;', '<'),
+            ('&amp;', '&'),
+        ):
+            template = u'3 %s 4'
+            raw = template % u
+            with_entities = template % entity
+            self.assertEqual(self.sub.substitute_html(raw), with_entities)
+
+    def test_html5_entity_with_variation_selector(self):
+        # A handful of HTML5 entities name either one Unicode
+        # character or that character followed by U+FE00 (VARIATION
+        # SELECTOR 1); each spelling must map to its own entity.
+        for raw_text, expected_markup in (
+            (u"fjords \u2294 penguins", u"fjords &sqcup; penguins"),
+            (u"fjords \u2294\ufe00 penguins", u"fjords &sqcups; penguins"),
+        ):
+            self.assertEqual(
+                self.sub.substitute_html(raw_text), expected_markup
+            )
+
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)