diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-30 07:55:09 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-30 07:55:09 -0400 |
commit | 0fe84d4d409273b6fadefe85e328e569b3296cfe (patch) | |
tree | 8c1a1d3e52c078c0daff87f7ae2a5e4b77a2fcc5 /bs4/builder | |
parent | a15e40b89282ef6924873e41bb08be2eb440880f (diff) |
Explained why we test both unicode and bytestring processing instructions.
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/_lxml.py | 23 |
1 file changed, 12 insertions, 11 deletions
```diff
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 34bb14e..d2ca287 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -32,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
     is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
 
     NAME = "lxml-xml"
     ALTERNATE_NAMES = ["xml"]
@@ -90,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
         Each 4-tuple represents a strategy for parsing the document.
         """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
         if isinstance(markup, unicode):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
@@ -101,16 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
             yield (markup.encode("utf8"), "utf8",
                    document_declared_encoding, False)
 
-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
-        if is_html:
-            pass
-            # self.processing_instruction_class = ProcessingInstruction
-        else:
-            self.processing_instruction_class = XMLProcessingInstruction
         try_encodings = [user_specified_encoding, document_declared_encoding]
         detector = EncodingDetector(
             markup, try_encodings, is_html, exclude_encodings)
@@ -236,8 +237,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
     ALTERNATE_NAMES = ["lxml-html"]
 
     features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
-    processing_instruction_class = ProcessingInstruction
     is_xml = False
+    processing_instruction_class = ProcessingInstruction
 
     def default_parser(self, encoding):
         return etree.HTMLParser
```