diff options
author | Leonard Richardson <leonardr@segfault.org> | 2021-10-24 21:15:31 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2021-10-24 21:15:31 -0400 |
commit | c1a7aaae7140897b2e845be8c5aa077d6654ee0a (patch) | |
tree | df6a58adc912d111e619094d7884d034a6649249 /bs4/builder/_lxml.py | |
parent | dd8aa7237b88569c99e85b300b0cf537aeaebfbd (diff) |
Issue a warning when an HTML parser is used to parse a document that
looks like XML but not XHTML. [bug=1939121]
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- | bs4/builder/_lxml.py | 11 |
1 file changed, 8 insertions, 3 deletions
```diff
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 1334f94..d8251b2 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -22,6 +22,7 @@ from bs4.element import (
     XMLProcessingInstruction,
 )
 from bs4.builder import (
+    DetectsXMLParsedAsHTML,
     FAST,
     HTML,
     HTMLTreeBuilder,
@@ -166,6 +167,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         is_html = not self.is_xml
         if is_html:
             self.processing_instruction_class = ProcessingInstruction
+            # We're in HTML mode, so if we're given XML, that's worth
+            # noting.
+            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
         else:
             self.processing_instruction_class = XMLProcessingInstruction
 
@@ -271,7 +275,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         namespace, name = self._getNsTag(name)
         nsprefix = self._prefix_for_namespace(namespace)
         self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-        
+
     def _prefix_for_namespace(self, namespace):
         """Find the currently active prefix for the given namespace."""
         if namespace is None:
@@ -299,9 +303,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     def pi(self, target, data):
         self.soup.endData()
-        self.soup.handle_data(target + ' ' + data)
+        data = target + ' ' + data
+        self.soup.handle_data(data)
         self.soup.endData(self.processing_instruction_class)
-        
+
     def data(self, content):
         self.soup.handle_data(content)
 
```