summaryrefslogtreecommitdiff
path: root/beautifulsoup/builder
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r--beautifulsoup/builder/__init__.py259
-rw-r--r--beautifulsoup/builder/_html5lib.py233
-rw-r--r--beautifulsoup/builder/_lxml.py108
3 files changed, 0 insertions, 600 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
deleted file mode 100644
index 10c6b7f..0000000
--- a/beautifulsoup/builder/__init__.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from collections import defaultdict
-import re
-import sys
-
-__all__ = [
- 'HTMLTreeBuilder',
- 'SAXTreeBuilder',
- 'TreeBuilder',
- 'TreeBuilderRegistry',
- ]
-
-# Some useful features for a TreeBuilder to have.
-FAST = 'fast'
-PERMISSIVE = 'permissive'
-XML = 'xml'
-HTML = 'html'
-HTML_5 = 'html5'
-
-
-class TreeBuilderRegistry(object):
-
- def __init__(self):
- self.builders_for_feature = defaultdict(list)
- self.builders = []
-
- def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
- for feature in treebuilder_class.features:
- self.builders_for_feature[feature].insert(0, treebuilder_class)
- self.builders.insert(0, treebuilder_class)
-
- def lookup(self, *features):
- if len(self.builders) == 0:
- # There are no builders at all.
- return None
-
- if len(features) == 0:
- # They didn't ask for any features. Give them the most
- # recently registered builder.
- return self.builders[0]
-
- # Go down the list of features in order, and eliminate any builders
- # that don't match every feature.
- features = list(features)
- features.reverse()
- candidates = None
- candidate_set = None
- while len(features) > 0:
- feature = features.pop()
- we_have_the_feature = self.builders_for_feature.get(feature, [])
- if len(we_have_the_feature) > 0:
- if candidates is None:
- candidates = we_have_the_feature
- candidate_set = set(candidates)
- else:
- # Eliminate any candidates that don't have this feature.
- candidate_set = candidate_set.intersection(
- set(we_have_the_feature))
-
- # The only valid candidates are the ones in candidate_set.
- # Go through the original list of candidates and pick the first one
- # that's in candidate_set.
- if candidate_set is None:
- return None
- for candidate in candidates:
- if candidate in candidate_set:
- return candidate
- return None
-
-# The BeautifulSoup class will take feature lists from developers and use them
-# to look up builders in this registry.
-builder_registry = TreeBuilderRegistry()
-
-
-class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
-
- features = []
-
- is_xml = False
- preserve_whitespace_tags = set()
- empty_element_tags = None # A tag will be considered an empty-element
- # tag when and only when it has no contents.
-
- def __init__(self):
- self.soup = None
-
- def reset(self):
- pass
-
- def can_be_empty_element(self, tag_name):
- """Might a tag with this name be an empty-element tag?
-
- The final markup may or may not actually present this tag as
- self-closing.
-
- For instance: an HTMLBuilder does not consider a <p> tag to be
- an empty-element tag (it's not in
- HTMLBuilder.empty_element_tags). This means an empty <p> tag
- will be presented as "<p></p>", not "<p />".
-
- The default implementation has no opinion about which tags are
- empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
- be left alone.
- """
- if self.empty_element_tags is None:
- return True
- return tag_name in self.empty_element_tags
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- return markup, None, None
-
- def test_fragment_to_document(self, fragment):
- """Wrap an HTML fragment to make it look like a document.
-
- Different parsers do this differently. For instance, lxml
- introduces an empty <head> tag, and html5lib
- doesn't. Abstracting this away lets us write simple tests
- which run HTML fragments through the parser and compare the
- results against other HTML fragments.
-
- This method should not be used outside of tests.
- """
- return fragment
-
- def set_up_substitutions(self, tag):
- pass
-
-
-class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def close(self):
- pass
-
- def startElement(self, name, attrs):
- attrs = dict((key[1], value) for key, value in attrs.items())
- #print "Start %s, %r" % (name, attrs)
- self.soup.handle_starttag(name, attrs)
-
- def endElement(self, name):
- #print "End %s" % name
- self.soup.handle_endtag(name)
-
- def startElementNS(self, nsTuple, nodeName, attrs):
- # Throw away (ns, nodeName) for now.
- self.startElement(nodeName, attrs)
-
- def endElementNS(self, nsTuple, nodeName):
- # Throw away (ns, nodeName) for now.
- self.endElement(nodeName)
- #handler.endElementNS((ns, node.nodeName), node.nodeName)
-
- def startPrefixMapping(self, prefix, nodeValue):
- # Ignore the prefix for now.
- pass
-
- def endPrefixMapping(self, prefix):
- # Ignore the prefix for now.
- # handler.endPrefixMapping(prefix)
- pass
-
- def characters(self, content):
- self.soup.handle_data(content)
-
- def startDocument(self):
- pass
-
- def endDocument(self):
- pass
-
-
-class HTMLTreeBuilder(TreeBuilder):
- """This TreeBuilder knows facts about HTML.
-
- Such as which tags are empty-element tags.
- """
-
- preserve_whitespace_tags = set(['pre', 'textarea'])
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
-
- # Used by set_up_substitutions to detect the charset in a META tag
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
- def set_up_substitutions(self, tag):
- if tag.name != 'meta':
- return False
-
- http_equiv = tag.get('http-equiv')
- content = tag.get('content')
-
- if (http_equiv is not None
- and content is not None
- and http_equiv.lower() == 'content-type'):
- # This is an interesting meta tag.
- match = self.CHARSET_RE.search(content)
- if match:
- if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.from_encoding):
- # An HTML encoding was sniffed while converting
- # the document to Unicode, or an HTML encoding was
- # sniffed during a previous pass through the
- # document, or an encoding was specified
- # explicitly and it worked. Rewrite the meta tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
- return True
- else:
- # This is our first pass through the document.
- # Go through it again with the encoding information.
- new_charset = match.group(3)
- if (new_charset is not None
- and new_charset != self.soup.original_encoding):
- self.soup.declared_html_encoding = new_charset
- self.soup._feed(self.soup.declared_html_encoding)
- raise StopParsing
- pass
- return False
-
-
-def register_treebuilders_from(module):
- """Copy TreeBuilders from the given module into this module."""
- # I'm fairly sure this is not the best way to do this.
- this_module = sys.modules[__package__]
- for name in module.__all__:
- obj = getattr(module, name)
-
- if issubclass(obj, TreeBuilder):
- setattr(this_module, name, obj)
- this_module.__all__.append(name)
- # Register the builder while we're at it.
- this_module.builder_registry.register(obj)
-
-# Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
-try:
- import _lxml
- register_treebuilders_from(_lxml)
-except ImportError:
- # They don't have lxml installed.
- pass
-try:
- import _html5lib
- register_treebuilders_from(_html5lib)
-except ImportError:
- # They don't have html5lib installed.
- pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
deleted file mode 100644
index f8a7a40..0000000
--- a/beautifulsoup/builder/_html5lib.py
+++ /dev/null
@@ -1,233 +0,0 @@
-__all__ = [
- 'HTML5TreeBuilder',
- ]
-
-from beautifulsoup.builder import (
- PERMISSIVE,
- HTML,
- HTML_5,
- HTMLTreeBuilder,
- )
-import html5lib
-from html5lib.constants import DataLossWarning
-import warnings
-from beautifulsoup.element import (
- Comment,
- Doctype,
- NavigableString,
- Tag,
- )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
-
- features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
- def prepare_markup(self, markup, user_specified_encoding):
- # Store the user-specified encoding for use later on.
- self.user_specified_encoding = user_specified_encoding
- return markup, None, None
-
- # These methods are defined by Beautiful Soup.
- def feed(self, markup):
- parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
- # Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
- # We need to special-case this because html5lib sets
- # charEncoding to UTF-8 if it gets Unicode input.
- doc.original_encoding = None
- else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
- def create_treebuilder(self, namespaceHTMLElements):
- self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
- return self.underlying_builder
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
- if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet",
- DataLossWarning)
- super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
- def documentClass(self):
- self.soup.reset()
- return Element(self.soup, self.soup, None)
-
- def insertDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
-
- doctype = Doctype.for_name_and_ids(name, publicId, systemId)
- self.soup.object_was_parsed(doctype)
-
- def elementClass(self, name, namespace):
- if namespace is not None:
- warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
-
- def commentClass(self, data):
- return TextNode(Comment(data), self.soup)
-
- def fragmentClass(self):
- self.soup = BeautifulSoup("")
- self.soup.name = "[document_fragment]"
- return Element(self.soup, self.soup, None)
-
- def appendChild(self, node):
- self.soup.insert(len(self.soup.contents), node.element)
-
- def testSerializer(self, element):
- return testSerializer(element)
-
- def getDocument(self):
- return self.soup
-
- def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
- def __init__(self, element):
- self.element = element
- self.attrs = dict(self.element.attrs)
- def __iter__(self):
- return self.attrs.items().__iter__()
- def __setitem__(self, name, value):
- "set attr", name, value
- self.element[name] = value
- def items(self):
- return self.attrs.items()
- def keys(self):
- return self.attrs.keys()
- def __getitem__(self, name):
- return self.attrs[name]
- def __contains__(self, name):
- return name in self.attrs.keys()
-
-
-class Element(html5lib.treebuilders._base.Node):
- def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
- self.element = element
- self.soup = soup
- self.namespace = namespace
-
- def _nodeIndex(self, node, refNode):
- # Finds a node by identity rather than equality
- for index in range(len(self.element.contents)):
- if id(self.element.contents[index]) == id(refNode.element):
- return index
- return None
-
- def appendChild(self, node):
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[-1].__class__ == NavigableString):
- # Concatenate new text onto old text node
- # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
- newStr = NavigableString(self.element.contents[-1]+node.element)
-
- # Remove the old text node
- # (Can't simply use .extract() by itself, because it fails if
- # an equal text node exists within the parent node)
- oldElement = self.element.contents[-1]
- del self.element.contents[-1]
- oldElement.parent = None
- oldElement.extract()
-
- self.element.insert(len(self.element.contents), newStr)
- else:
- self.element.insert(len(self.element.contents), node.element)
- node.parent = self
-
- def getAttributes(self):
- return AttrList(self.element)
-
- def setAttributes(self, attributes):
- if attributes is not None and attributes != {}:
- for name, value in attributes.items():
- self.element[name] = value
- # The attributes may contain variables that need substitution.
- # Call set_up_substitutions manually.
- # The Tag constructor calls this method automatically,
- # but html5lib creates a Tag object before setting up
- # the attributes.
- self.element.contains_substitutions = (
- self.soup.builder.set_up_substitutions(
- self.element))
- attributes = property(getAttributes, setAttributes)
-
- def insertText(self, data, insertBefore=None):
- text = TextNode(NavigableString(data), self.soup)
- if insertBefore:
- self.insertBefore(text, insertBefore)
- else:
- self.appendChild(text)
-
- def insertBefore(self, node, refNode):
- index = self._nodeIndex(node, refNode)
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[index-1].__class__ == NavigableString):
- # (See comments in appendChild)
- newStr = NavigableString(self.element.contents[index-1]+node.element)
- oldNode = self.element.contents[index-1]
- del self.element.contents[index-1]
- oldNode.parent = None
- oldNode.extract()
-
- self.element.insert(index-1, newStr)
- else:
- self.element.insert(index, node.element)
- node.parent = self
-
- def removeChild(self, node):
- index = self._nodeIndex(node.parent, node)
- del node.parent.element.contents[index]
- node.element.parent = None
- node.element.extract()
- node.parent = None
-
- def reparentChildren(self, newParent):
- while self.element.contents:
- child = self.element.contents[0]
- child.extract()
- if isinstance(child, Tag):
- newParent.appendChild(Element(child, self.soup, namespaces["html"]))
- else:
- newParent.appendChild(TextNode(child, self.soup))
-
- def cloneNode(self):
- node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
- for key,value in self.attributes:
- node.attributes[key] = value
- return node
-
- def hasContent(self):
- return self.element.contents
-
- def getNameTuple(self):
- if self.namespace == None:
- return namespaces["html"], self.name
- else:
- return self.namespace, self.name
-
- nameTuple = property(getNameTuple)
-
-class TextNode(Element):
- def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
- self.element = element
- self.soup = soup
-
- def cloneNode(self):
- raise NotImplementedError
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
deleted file mode 100644
index 23ac485..0000000
--- a/beautifulsoup/builder/_lxml.py
+++ /dev/null
@@ -1,108 +0,0 @@
-__all__ = [
- 'LXMLTreeBuilderForXML',
- 'LXMLTreeBuilder',
- ]
-
-from lxml import etree
-from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import (
- FAST,
- HTML,
- HTMLTreeBuilder,
- PERMISSIVE,
- TreeBuilder,
- XML)
-from beautifulsoup.dammit import UnicodeDammit
-import types
-
-LXML = 'lxml'
-
-class LXMLTreeBuilderForXML(TreeBuilder):
- DEFAULT_PARSER_CLASS = etree.XMLParser
-
- is_xml = True
-
- # Well, it's permissive by XML parser standards.
- features = [LXML, XML, FAST, PERMISSIVE]
-
- @property
- def default_parser(self):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
- return etree.XMLParser(target=self, strip_cdata=False, recover=True)
-
- def __init__(self, parser=None, empty_element_tags=None):
- if empty_element_tags is not None:
- self.empty_element_tags = set(empty_element_tags)
- if parser is None:
- # Use the default parser.
- parser = self.default_parser
- if callable(parser):
- # Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False)
- self.parser = parser
- self.soup = None
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- """
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
- """
- if isinstance(markup, unicode):
- return markup, None, None
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding)
-
- def feed(self, markup):
- self.parser.feed(markup)
- self.parser.close()
-
- def close(self):
- pass
-
- def start(self, name, attrs):
- self.soup.handle_starttag(name, attrs)
-
- def end(self, name):
- self.soup.endData()
- completed_tag = self.soup.tagStack[-1]
- self.soup.handle_endtag(name)
-
- def pi(self, target, data):
- pass
-
- def data(self, content):
- self.soup.handle_data(content)
-
- def doctype(self, name, pubid, system):
- self.soup.endData()
- doctype = Doctype.for_name_and_ids(name, pubid, system)
- self.soup.object_was_parsed(doctype)
-
- def comment(self, content):
- "Handle comments as Comment objects."
- self.soup.endData()
- self.soup.handle_data(content)
- self.soup.endData(Comment)
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment
-
-
-class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
-
- features = [LXML, HTML, FAST]
- is_xml = False
-
- @property
- def default_parser(self):
- return etree.HTMLParser
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment