diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:51:31 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 11:51:31 -0400 |
commit | e4ff05f0783605350171f6623d4055837c2af14f (patch) | |
tree | 7ed51d902a9705ce8e8fb84587fe715b9830ec5c | |
parent | b0d82663f225c501f1417bcc155c156327134404 (diff) |
Added a separate class for XML processing instructions, which have a slightly different format from SGML processing instructions. [bug=1504383]
-rw-r--r-- | bs4/builder/_htmlparser.py | 1 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 8 | ||||
-rw-r--r-- | bs4/element.py | 6 | ||||
-rw-r--r-- | bs4/testing.py | 5 |
4 files changed, 19 insertions, 1 deletions
```diff
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 823ca15..b919be4 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -8,6 +8,7 @@ __all__ = [
     ]
 
 from HTMLParser import HTMLParser
+from pdb import set_trace
 
 try:
     from HTMLParser import HTMLParseError
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index bfec582..67cdb4c 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,6 +5,7 @@ __all__ = [
     'LXMLTreeBuilder',
     ]
 
+from pdb import set_trace
 from io import BytesIO
 from StringIO import StringIO
 import collections
@@ -14,6 +15,7 @@ from bs4.element import (
     Doctype,
     NamespacedAttribute,
     ProcessingInstruction,
+    XMLProcessingInstruction,
 )
 from bs4.builder import (
     FAST,
@@ -105,6 +107,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         # iterate over the encodings, and tell lxml to try to parse
         # the document as each one in turn.
         is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(
             markup, try_encodings, is_html, exclude_encodings)
@@ -203,7 +209,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def pi(self, target, data):
         self.soup.endData()
         self.soup.handle_data(target + ' ' + data)
-        self.soup.endData(ProcessingInstruction)
+        self.soup.endData(self.processing_instruction_class)
 
     def data(self, content):
         self.soup.handle_data(content)
diff --git a/bs4/element.py b/bs4/element.py
index 872ebf6..9e2bfec 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -745,10 +745,16 @@ class CData(PreformattedString):
     SUFFIX = u']]>'
 
 class ProcessingInstruction(PreformattedString):
+    """A SGML processing instruction."""
 
     PREFIX = u'<?'
     SUFFIX = u'>'
 
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 class Comment(PreformattedString):
 
     PREFIX = u'<!--'
diff --git a/bs4/testing.py b/bs4/testing.py
index 4af16f6..1e2cc9c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -565,6 +565,11 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(markup, soup.encode("utf8"))
 
+    def test_processing_instruction(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
     def test_real_xhtml_document(self):
         """A real XHTML document should come out *exactly* the same as it went in."""
         markup = b"""<?xml version="1.0" encoding="utf-8"?>
```