summaryrefslogtreecommitdiff
path: root/bs4/builder
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder')
-rw-r--r--bs4/builder/_html5lib.py233
-rw-r--r--bs4/builder/_lxml.py108
2 files changed, 341 insertions, 0 deletions
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
new file mode 100644
index 0000000..d74c4b0
--- /dev/null
+++ b/bs4/builder/_html5lib.py
@@ -0,0 +1,233 @@
+__all__ = [
+ 'HTML5TreeBuilder',
+ ]
+
+from bs4.builder import (
+ PERMISSIVE,
+ HTML,
+ HTML_5,
+ HTMLTreeBuilder,
+ )
+import html5lib
+from html5lib.constants import DataLossWarning
+import warnings
+from bs4.element import (
+ Comment,
+ Doctype,
+ NavigableString,
+ Tag,
+ )
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+ """Use html5lib to build a tree."""
+
+ features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+
+ def prepare_markup(self, markup, user_specified_encoding):
+ # Store the user-specified encoding for use later on.
+ self.user_specified_encoding = user_specified_encoding
+ return markup, None, None
+
+ # These methods are defined by Beautiful Soup.
+ def feed(self, markup):
+ parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+ doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+ # Set the character encoding detected by the tokenizer.
+ if isinstance(markup, unicode):
+ # We need to special-case this because html5lib sets
+ # charEncoding to UTF-8 if it gets Unicode input.
+ doc.original_encoding = None
+ else:
+ doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+
+ def create_treebuilder(self, namespaceHTMLElements):
+ self.underlying_builder = TreeBuilderForHtml5lib(
+ self.soup, namespaceHTMLElements)
+ return self.underlying_builder
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><head></head><body>%s</body></html>' % fragment
+
+
+class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+
+ def __init__(self, soup, namespaceHTMLElements):
+ self.soup = soup
+ if namespaceHTMLElements:
+ warnings.warn("namespaceHTMLElements not supported yet",
+ DataLossWarning)
+ super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+ def documentClass(self):
+ self.soup.reset()
+ return Element(self.soup, self.soup, None)
+
+ def insertDoctype(self, token):
+ name = token["name"]
+ publicId = token["publicId"]
+ systemId = token["systemId"]
+
+ doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+ self.soup.object_was_parsed(doctype)
+
+ def elementClass(self, name, namespace):
+ if namespace is not None:
+ warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
+ return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
+
+ def commentClass(self, data):
+ return TextNode(Comment(data), self.soup)
+
+ def fragmentClass(self):
+ self.soup = BeautifulSoup("")
+ self.soup.name = "[document_fragment]"
+ return Element(self.soup, self.soup, None)
+
+ def appendChild(self, node):
+ self.soup.insert(len(self.soup.contents), node.element)
+
+ def testSerializer(self, element):
+ return testSerializer(element)
+
+ def getDocument(self):
+ return self.soup
+
+ def getFragment(self):
+ return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+
+class AttrList(object):
+ def __init__(self, element):
+ self.element = element
+ self.attrs = dict(self.element.attrs)
+ def __iter__(self):
+ return self.attrs.items().__iter__()
+ def __setitem__(self, name, value):
+ "set attr", name, value
+ self.element[name] = value
+ def items(self):
+ return self.attrs.items()
+ def keys(self):
+ return self.attrs.keys()
+ def __getitem__(self, name):
+ return self.attrs[name]
+ def __contains__(self, name):
+ return name in self.attrs.keys()
+
+
+class Element(html5lib.treebuilders._base.Node):
+ def __init__(self, element, soup, namespace):
+ html5lib.treebuilders._base.Node.__init__(self, element.name)
+ self.element = element
+ self.soup = soup
+ self.namespace = namespace
+
+ def _nodeIndex(self, node, refNode):
+ # Finds a node by identity rather than equality
+ for index in range(len(self.element.contents)):
+ if id(self.element.contents[index]) == id(refNode.element):
+ return index
+ return None
+
+ def appendChild(self, node):
+ if (node.element.__class__ == NavigableString and self.element.contents
+ and self.element.contents[-1].__class__ == NavigableString):
+ # Concatenate new text onto old text node
+ # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
+ newStr = NavigableString(self.element.contents[-1]+node.element)
+
+ # Remove the old text node
+ # (Can't simply use .extract() by itself, because it fails if
+ # an equal text node exists within the parent node)
+ oldElement = self.element.contents[-1]
+ del self.element.contents[-1]
+ oldElement.parent = None
+ oldElement.extract()
+
+ self.element.insert(len(self.element.contents), newStr)
+ else:
+ self.element.insert(len(self.element.contents), node.element)
+ node.parent = self
+
+ def getAttributes(self):
+ return AttrList(self.element)
+
+ def setAttributes(self, attributes):
+ if attributes is not None and attributes != {}:
+ for name, value in attributes.items():
+ self.element[name] = value
+ # The attributes may contain variables that need substitution.
+ # Call set_up_substitutions manually.
+ # The Tag constructor calls this method automatically,
+ # but html5lib creates a Tag object before setting up
+ # the attributes.
+ self.element.contains_substitutions = (
+ self.soup.builder.set_up_substitutions(
+ self.element))
+ attributes = property(getAttributes, setAttributes)
+
+ def insertText(self, data, insertBefore=None):
+ text = TextNode(NavigableString(data), self.soup)
+ if insertBefore:
+ self.insertBefore(text, insertBefore)
+ else:
+ self.appendChild(text)
+
+ def insertBefore(self, node, refNode):
+ index = self._nodeIndex(node, refNode)
+ if (node.element.__class__ == NavigableString and self.element.contents
+ and self.element.contents[index-1].__class__ == NavigableString):
+ # (See comments in appendChild)
+ newStr = NavigableString(self.element.contents[index-1]+node.element)
+ oldNode = self.element.contents[index-1]
+ del self.element.contents[index-1]
+ oldNode.parent = None
+ oldNode.extract()
+
+ self.element.insert(index-1, newStr)
+ else:
+ self.element.insert(index, node.element)
+ node.parent = self
+
+ def removeChild(self, node):
+ index = self._nodeIndex(node.parent, node)
+ del node.parent.element.contents[index]
+ node.element.parent = None
+ node.element.extract()
+ node.parent = None
+
+ def reparentChildren(self, newParent):
+ while self.element.contents:
+ child = self.element.contents[0]
+ child.extract()
+ if isinstance(child, Tag):
+ newParent.appendChild(Element(child, self.soup, namespaces["html"]))
+ else:
+ newParent.appendChild(TextNode(child, self.soup))
+
+ def cloneNode(self):
+ node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
+ for key,value in self.attributes:
+ node.attributes[key] = value
+ return node
+
+ def hasContent(self):
+ return self.element.contents
+
+ def getNameTuple(self):
+ if self.namespace == None:
+ return namespaces["html"], self.name
+ else:
+ return self.namespace, self.name
+
+ nameTuple = property(getNameTuple)
+
+class TextNode(Element):
+ def __init__(self, element, soup):
+ html5lib.treebuilders._base.Node.__init__(self, None)
+ self.element = element
+ self.soup = soup
+
+ def cloneNode(self):
+ raise NotImplementedError
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
new file mode 100644
index 0000000..5c9bc57
--- /dev/null
+++ b/bs4/builder/_lxml.py
@@ -0,0 +1,108 @@
+__all__ = [
+ 'LXMLTreeBuilderForXML',
+ 'LXMLTreeBuilder',
+ ]
+
+from lxml import etree
+from bs4.element import Comment, Doctype
+from bs4.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ TreeBuilder,
+ XML)
+from bs4.dammit import UnicodeDammit
+import types
+
+LXML = 'lxml'
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+ DEFAULT_PARSER_CLASS = etree.XMLParser
+
+ is_xml = True
+
+ # Well, it's permissive by XML parser standards.
+ features = [LXML, XML, FAST, PERMISSIVE]
+
+ @property
+ def default_parser(self):
+ # This can either return a parser object or a class, which
+ # will be instantiated with default arguments.
+ return etree.XMLParser(target=self, strip_cdata=False, recover=True)
+
+ def __init__(self, parser=None, empty_element_tags=None):
+ if empty_element_tags is not None:
+ self.empty_element_tags = set(empty_element_tags)
+ if parser is None:
+ # Use the default parser.
+ parser = self.default_parser
+ if callable(parser):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, strip_cdata=False)
+ self.parser = parser
+ self.soup = None
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
+
+ def feed(self, markup):
+ self.parser.feed(markup)
+ self.parser.close()
+
+ def close(self):
+ pass
+
+ def start(self, name, attrs):
+ self.soup.handle_starttag(name, attrs)
+
+ def end(self, name):
+ self.soup.endData()
+ completed_tag = self.soup.tagStack[-1]
+ self.soup.handle_endtag(name)
+
+ def pi(self, target, data):
+ pass
+
+ def data(self, content):
+ self.soup.handle_data(content)
+
+ def doctype(self, name, pubid, system):
+ self.soup.endData()
+ doctype = Doctype.for_name_and_ids(name, pubid, system)
+ self.soup.object_was_parsed(doctype)
+
+ def comment(self, content):
+ "Handle comments as Comment objects."
+ self.soup.endData()
+ self.soup.handle_data(content)
+ self.soup.endData(Comment)
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+
+ features = [LXML, HTML, FAST]
+ is_xml = False
+
+ @property
+ def default_parser(self):
+ return etree.HTMLParser
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><body>%s</body></html>' % fragment