diff options
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 61 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 22 |
4 files changed, 71 insertions, 21 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index f8fce56..601979b 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -1,9 +1,13 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + HTMLAwareEntitySubstitution, whitespace_re ) @@ -227,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = set(['pre', 'textarea']) + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 755518d..5f54893 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -1,8 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __all__ = [ 'HTML5TreeBuilder', ] -from pdb import set_trace import warnings import re from bs4.builder import ( @@ -27,6 +29,15 @@ from bs4.element import ( Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" @@ -51,7 +62,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): @@ -59,7 +77,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( @@ -71,7 +95,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): return u'<html><head></head><body>%s</body></html>' % fragment -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def __init__(self, namespaceHTMLElements, soup=None): if soup: @@ -114,7 +138,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return self.soup def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element def testSerializer(self, element): from bs4 import BeautifulSoup @@ -196,9 +220,9 @@ class AttrList(object): return name in list(self.attrs.keys()) -class Element(html5lib.treebuilders._base.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ -258,6 +282,8 @@ class Element(html5lib.treebuilders._base.Node): most_recent_element=most_recent_element) def getAttributes(self): + if isinstance(self.element, Comment): + return {} return AttrList(self.element) def setAttributes(self, attributes): @@ -311,6 +337,7 @@ class Element(html5lib.treebuilders._base.Node): # print "MOVE", self.element.contents # print "FROM", self.element # print "TO", new_parent.element + element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -329,7 +356,6 @@ class Element(html5lib.treebuilders._base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent @@ -346,12 +372,19 @@ class Element(html5lib.treebuilders._base.Node): if new_parents_last_child: new_parents_last_child.next_sibling = first_child - # Fix the last child's next_element and next_sibling - last_child = to_append[-1] - last_child.next_element = new_parents_last_descendant_next_element + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element if new_parents_last_descendant_next_element: - new_parents_last_descendant_next_element.previous_element = last_child - last_child.next_sibling = None + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None for child in to_append: child.parent = new_parent_element @@ -385,7 +418,7 @@ class Element(html5lib.treebuilders._base.Node): class TextNode(Element): def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 0101d64..823ca15 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -1,5 +1,8 @@ """Use the HTMLParser library to parse HTML files that aren't too bad.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __all__ = [ 'HTMLParserTreeBuilder', ] diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 9e8f88f..d2ca287 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -1,3 +1,5 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -12,6 +14,7 @@ from bs4.element import ( Doctype, NamespacedAttribute, ProcessingInstruction, + XMLProcessingInstruction, ) from bs4.builder import ( FAST, @@ -29,6 +32,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction NAME = "lxml-xml" ALTERNATE_NAMES = ["xml"] @@ -87,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? @@ -98,11 +112,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] detector = EncodingDetector( markup, try_encodings, is_html, exclude_encodings) @@ -201,7 +210,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def pi(self, target, data): self.soup.endData() self.soup.handle_data(target + ' ' + data) - self.soup.endData(ProcessingInstruction) + self.soup.endData(self.processing_instruction_class) def data(self, content): self.soup.handle_data(content) @@ -229,6 +238,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser |