diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 16:58:23 -0400 |
commit | 6e319a74343b9efb69517ab178dbea921f438ee1 (patch) | |
tree | 7c4e3dbd72b473211d7f9c4509a0a334ae53621f /bs4/tests | |
parent | 58cffa003e82049b78f14db73518d000cd05e3d6 (diff) |
Correctly handle invalid HTML numeric character entities like &#147;
which reference code points that are not Unicode code points. Note
that this is only fixed when Beautiful Soup is used with the
html.parser parser -- html5lib already worked and I couldn't fix it
with lxml. [bug=1782933]
Diffstat (limited to 'bs4/tests')
-rw-r--r-- | bs4/tests/test_lxml.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 2 |
2 files changed, 7 insertions, 1 deletions
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index a05870b..23cbaef 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") + def test_entities_in_original_document_encoding(self): + # We can't implement this case correctly because by the time we + # hear about markup like "“", it's been (incorrectly) converted into + # a string like u'\x93' + pass + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this # test if an old version of lxml is installed. diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index e5cc47e..e5dcfa7 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -1206,7 +1206,7 @@ class TestElementObjects(SoupTest): tag = soup.bTag self.assertEqual(soup.b, tag) self.assertEqual( - '.bTag is deprecated, use .find("b") instead.', + '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")', str(w[0].message)) def test_has_attr(self): |