From 6780794cdcc52305691f7a299015bfb9b5941e27 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 8 Dec 2015 15:47:30 +0000 Subject: Make TreeBuilderForHtml5lib strictly follow the html5lib API. This slightly changes the constructor (to make soup optional), and adds a testSerializer method so the tests can be run against it. --- bs4/builder/_html5lib.py | 69 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) (limited to 'bs4/builder/_html5lib.py') diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 8725a65..a535747 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -4,6 +4,7 @@ __all__ = [ from pdb import set_trace import warnings +import re from bs4.builder import ( PERMISSIVE, HTML, @@ -15,7 +16,10 @@ from bs4.element import ( whitespace_re, ) import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + namespaces, + prefixes, + ) from bs4.element import ( Comment, Doctype, @@ -59,7 +63,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) + namespaceHTMLElements, self.soup) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -69,8 +73,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder): class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -93,7 +101,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return TextNode(Comment(data), self.soup) def fragmentClass(self): - self.soup = BeautifulSoup("") + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) @@ -107,6 +116,56 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def getFragment(self): return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s" % (' ' * indent, name)) + else: + rv.append("|%s" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + class AttrList(object): def __init__(self, element): self.element = element -- cgit v1.2.3 From a07a28ac10e5686d096fd0f4225e01ff137ec80f Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 8 Dec 2015 16:02:57 +0000 Subject: Fix foster parenting with html5lib. This makes all of the html5lib tests pass. Yay! --- bs4/builder/_html5lib.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'bs4/builder/_html5lib.py') diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index a535747..755518d 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -217,8 +217,10 @@ class Element(html5lib.treebuilders._base.Node): child = node elif node.element.__class__ == NavigableString: string_child = child = node.element + node.parent = self else: child = node.element + node.parent = self if not isinstance(child, basestring) and child.parent is not None: node.element.extract() @@ -283,11 +285,11 @@ class Element(html5lib.treebuilders._base.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - text = TextNode(self.soup.new_string(data), self.soup) - self.insertBefore(data, insertBefore) + self.insertBefore(text, insertBefore) else: - self.appendChild(data) + self.appendChild(text) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) -- cgit v1.2.3