Added a workaround for an lxml bug (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
problems when parsing a Unicode string beginning with BYTE ORDER MARK. [bug=1947768]
author: Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
commit: 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 (patch)
tree: 37e907220d500b38793be279ea1d2e454e8ca98a
parent: 0afd48f8b3069fb3577749374e20611b231cb829 (diff)
4 files changed, 17 insertions, 1 deletion
diff --git a/CHANGELOG b/CHANGELOG
index 6e9e66d..fcd203d 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -14,6 +14,11 @@ Python 2 was revision 605.
 * Fixed a crash when overriding multi_valued_attributes and using the
   html5lib parser. [bug=1948488]
 
+* Added a workaround for an lxml bug
+  (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
+  problems when parsing a Unicode string beginning with BYTE ORDER MARK.
+  [bug=1947768]
+
 = 4.10.0 (20210907)
 
 * This is the first release of Beautiful Soup to only support Python
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 11c9a69..1334f94 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -172,6 +172,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
             yield markup, None, document_declared_encoding, False
 
         if isinstance(markup, str):
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 6677d2e..5147f0e 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -1024,6 +1024,11 @@ class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest):
         soup = self.soup(markup)
         assert 'Sacr\xe9 bleu!' == soup.root.string
 
+    def test_can_parse_unicode_document_begining_with_bom(self):
+        markup = '\N{BYTE ORDER MARK}<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        assert 'Sacr\xe9 bleu!' == soup.root.string
+        
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 1269e57..be954db 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -87,7 +87,7 @@ class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
-class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
+class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
     """See ``HTMLTreeBuilderSmokeTest``."""
 
     @property
author	Leonard Richardson <leonardr@segfault.org>	2021-10-23 17:13:58 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2021-10-23 17:13:58 -0400
commit	1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 (patch)
tree	37e907220d500b38793be279ea1d2e454e8ca98a
parent	0afd48f8b3069fb3577749374e20611b231cb829 (diff)