diff options
Diffstat (limited to 'bs4/tests')
-rw-r--r-- | bs4/tests/test_html5lib.py | 36 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 32 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 45 |
4 files changed, 114 insertions, 1 deletions
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 7b0a6d4..2adebc8 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -188,3 +188,39 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         # because there's no way of knowing, when a string is created,
         # where in the tree it will eventually end up.
         pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index db85d2d..e84eced 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -88,6 +88,38 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             attrs[key].append(value)
         assert_attribute(accumulate, ["url1", "url2", "url3"])
 
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+            ("&RightArrowLeftArrow;", u'\u21c4', b'&rlarr;'),
+            ('&models;', u'\u22a7', b'&models;'),
+            ('&Nfr;', u'\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', u'\u2267\u0338', b'&ngeqq;'),
+            ('&not;', u'\xac', b'&not;'),
+            ('&Not;', u'\u2aec', b'&Not;'),
+            ('&quot;', u'"', b'"'),
+            ('&there4;', u'\u2234', b'&there4;'),
+            ('&Therefore;', u'\u2234', b'&there4;'),
+            ('&therefore;', u'\u2234', b'&there4;'),
+            ("&fjlig;", u'fj', b'fj'),
+            ("&sqcup;", u'\u2294', b'&sqcup;'),
+            ("&sqcups;", u'\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", u"'", b"'"),
+            ("&verbar;", u"|", b"|"),
+        ):
+            markup = u'<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index f96e4ae..d8dada4 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             "<p>foo�bar</p>", "<p>foobar</p>")
         self.assertSoupEquals(
             "<p>foo�bar</p>", "<p>foobar</p>")
-        
+
     def test_entities_in_foreign_document_encoding(self):
         # We can't implement this case correctly because by the time we
         # hear about markup like "&#147;", it's been (incorrectly) converted into
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index ddb6446..9074bdb 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -368,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
         self.assertEqual(self.sub.substitute_html(dammit.markup),
                           "&lsquo;&rsquo;foo&ldquo;&rdquo;")
 
+    def test_html5_entity(self):
+        # Some HTML5 entities correspond to single- or multi-character
+        # Unicode sequences.
+
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ('&models;', u'\u22a7'),
+            ('&Nfr;', u'\U0001d511'),
+            ('&ngeqq;', u'\u2267\u0338'),
+            ('&not;', u'\xac'),
+            ('&Not;', u'\u2aec'),
+
+            # We _could_ convert | to &verbar;, but we don't, because
+            # | is an ASCII character.
+            ('|', '|'),
+
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ('&gt;', '>'),
+            ('&lt;', '<'),
+            ('&amp;', '&'),
+        ):
+            template = u'3 %s 4'
+            raw = template % u
+            with_entities = template % entity
+            self.assertEqual(self.sub.substitute_html(raw), with_entities)
+
+    def test_html5_entity_with_variation_selector(self):
+        # Some HTML5 entities correspond either to a single-character
+        # Unicode sequence _or_ to the same character plus U+FE00,
+        # VARIATION SELECTOR 1. We can handle this.
+        data = u"fjords \u2294 penguins"
+        markup = u"fjords &sqcup; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+
+        data = u"fjords \u2294\ufe00 penguins"
+        markup = u"fjords &sqcups; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+
     def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
         s = 'Welcome to "my bar"'
         self.assertEqual(self.sub.substitute_xml(s, False), s)