summary | refs | log | tree | commit | diff
path: root/bs4/tests/__init__.py
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
committer Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
commit 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 (patch)
tree 37e907220d500b38793be279ea1d2e454e8ca98a /bs4/tests/__init__.py
parent 0afd48f8b3069fb3577749374e20611b231cb829 (diff)
Added a workaround for an lxml bug (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
problems when parsing a Unicode string beginning with BYTE ORDER MARK. [bug=1947768]
Diffstat (limited to 'bs4/tests/__init__.py')
-rw-r--r-- bs4/tests/__init__.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 6677d2e..5147f0e 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -1024,6 +1024,11 @@ class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
soup = self.soup(markup)
assert 'Sacr\xe9 bleu!' == soup.root.string
+ def test_can_parse_unicode_document_begining_with_bom(self):
+ markup = '\N{BYTE ORDER MARK}<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ soup = self.soup(markup)
+ assert 'Sacr\xe9 bleu!' == soup.root.string
+
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)