From 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sat, 23 Oct 2021 17:13:58 -0400
Subject: Added a workaround for an lxml bug
 (https://bugs.launchpad.net/lxml/+bug/1948551) that caused problems when
 parsing a Unicode string beginning with BYTE ORDER MARK. [bug=1947768]

---
 bs4/builder/_lxml.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'bs4/builder/_lxml.py')

diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 11c9a69..1334f94 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -172,6 +172,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
             yield markup, None, document_declared_encoding, False
 
         if isinstance(markup, str):
-- 
cgit v1.2.3