diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-10-23 17:13:58 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-10-23 17:13:58 -0400 |
commit | 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 (patch) | |
tree | 37e907220d500b38793be279ea1d2e454e8ca98a | |
parent | 0afd48f8b3069fb3577749374e20611b231cb829 (diff) |
Added a workaround for an lxml bug (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
problems when parsing a Unicode string beginning with BYTE ORDER MARK.
[bug=1947768]
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 6 | ||||
-rw-r--r-- | bs4/tests/__init__.py | 5 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 2 |
4 files changed, 17 insertions, 1 deletion
```diff
@@ -14,6 +14,11 @@ Python 2 was revision 605.
 * Fixed a crash when overriding multi_valued_attributes and using the
   html5lib parser. [bug=1948488]
 
+* Added a workaround for an lxml bug
+  (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
+  problems when parsing a Unicode string beginning with BYTE ORDER MARK.
+  [bug=1947768]
+
 = 4.10.0 (20210907)
 
 * This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 11c9a69..1334f94 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -172,6 +172,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
             yield markup, None, document_declared_encoding, False
 
         if isinstance(markup, str):
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 6677d2e..5147f0e 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -1024,6 +1024,11 @@ class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
         soup = self.soup(markup)
         assert 'Sacr\xe9 bleu!' == soup.root.string
 
+    def test_can_parse_unicode_document_begining_with_bom(self):
+        markup = '\N{BYTE ORDER MARK}<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        assert 'Sacr\xe9 bleu!' == soup.root.string
+
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 1269e57..be954db 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -87,7 +87,7 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
-class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
+class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
     """See ``HTMLTreeBuilderSmokeTest``."""
 
     @property
```