From 1b7c99d3dfc65e8b0448f06f084097947d3ce1a2 Mon Sep 17 00:00:00 2001
From: Leonard Richardson
Date: Sat, 23 Oct 2021 17:13:58 -0400
Subject: Added a workaround for an lxml bug (https://bugs.launchpad.net/lxml/+bug/1948551) that caused problems when parsing a Unicode string beginning with BYTE ORDER MARK. [bug=1947768]

---
 bs4/builder/_lxml.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'bs4/builder/_lxml.py')

diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 11c9a69..1334f94 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -172,6 +172,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         if isinstance(markup, str):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
+
+            # TODO: This is a workaround for
+            # https://bugs.launchpad.net/lxml/+bug/1948551.
+            # We can remove it once the upstream issue is fixed.
+            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
+                markup = markup[1:]
             yield markup, None, document_declared_encoding, False
 
         if isinstance(markup, str):
-- 
cgit v1.2.3