summary | refs | log | tree | commit | diff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2021-10-23 17:13:58 -0400
commit: 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 (patch)
tree: 37e907220d500b38793be279ea1d2e454e8ca98a /bs4/builder/_lxml.py
parent: 0afd48f8b3069fb3577749374e20611b231cb829 (diff)
Added a workaround for an lxml bug (https://bugs.launchpad.net/lxml/+bug/1948551) that caused
problems when parsing a Unicode string beginning with BYTE ORDER MARK. [bug=1947768]
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py | 6
1 file changed, 6 insertions, 0 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 11c9a69..1334f94 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -172,6 +172,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
+
+ # TODO: This is a workaround for
+ # https://bugs.launchpad.net/lxml/+bug/1948551.
+ # We can remove it once the upstream issue is fixed.
+ if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+ markup = markup[1:]
yield markup, None, document_declared_encoding, False
if isinstance(markup, str):