summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2016-07-16 11:51:31 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-16 11:51:31 -0400
commit: e4ff05f0783605350171f6623d4055837c2af14f (patch)
tree: 7ed51d902a9705ce8e8fb84587fe715b9830ec5c
parent: b0d82663f225c501f1417bcc155c156327134404 (diff)
Added a separate class for XML processing instructions, which have a slightly different format from SGML processing instructions. [bug=1504383]
-rw-r--r-- bs4/builder/_htmlparser.py 1
-rw-r--r-- bs4/builder/_lxml.py 8
-rw-r--r-- bs4/element.py 6
-rw-r--r-- bs4/testing.py 5
4 files changed, 19 insertions, 1 deletion
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 823ca15..b919be4 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -8,6 +8,7 @@ __all__ = [
]
from HTMLParser import HTMLParser
+from pdb import set_trace
try:
from HTMLParser import HTMLParseError
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index bfec582..67cdb4c 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,6 +5,7 @@ __all__ = [
'LXMLTreeBuilder',
]
+from pdb import set_trace
from io import BytesIO
from StringIO import StringIO
import collections
@@ -14,6 +15,7 @@ from bs4.element import (
Doctype,
NamespacedAttribute,
ProcessingInstruction,
+ XMLProcessingInstruction,
)
from bs4.builder import (
FAST,
@@ -105,6 +107,10 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
@@ -203,7 +209,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def pi(self, target, data):
self.soup.endData()
self.soup.handle_data(target + ' ' + data)
- self.soup.endData(ProcessingInstruction)
+ self.soup.endData(self.processing_instruction_class)
def data(self, content):
self.soup.handle_data(content)
diff --git a/bs4/element.py b/bs4/element.py
index 872ebf6..9e2bfec 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -745,10 +745,16 @@ class CData(PreformattedString):
SUFFIX = u']]>'
class ProcessingInstruction(PreformattedString):
+ """A SGML processing instruction."""
PREFIX = u'<?'
SUFFIX = u'>'
+class XMLProcessingInstruction(ProcessingInstruction):
+ """An XML processing instruction."""
+ PREFIX = u'<?'
+ SUFFIX = u'?>'
+
class Comment(PreformattedString):
PREFIX = u'<!--'
diff --git a/bs4/testing.py b/bs4/testing.py
index 4af16f6..1e2cc9c 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -565,6 +565,11 @@ class XMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
+ def test_processing_instruction(self):
+ markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>