Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- | beautifulsoup/builder/__init__.py  | 259 |
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 233 |
-rw-r--r-- | beautifulsoup/builder/_lxml.py     | 108 |
3 files changed, 0 insertions, 600 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
deleted file mode 100644
index 10c6b7f..0000000
--- a/beautifulsoup/builder/__init__.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from collections import defaultdict
-import re
-import sys
-
-__all__ = [
-    'HTMLTreeBuilder',
-    'SAXTreeBuilder',
-    'TreeBuilder',
-    'TreeBuilderRegistry',
-    ]
-
-# Some useful features for a TreeBuilder to have.
-FAST = 'fast'
-PERMISSIVE = 'permissive'
-XML = 'xml'
-HTML = 'html'
-HTML_5 = 'html5'
-
-
-class TreeBuilderRegistry(object):
-
-    def __init__(self):
-        self.builders_for_feature = defaultdict(list)
-        self.builders = []
-
-    def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
-        for feature in treebuilder_class.features:
-            self.builders_for_feature[feature].insert(0, treebuilder_class)
-        self.builders.insert(0, treebuilder_class)
-
-    def lookup(self, *features):
-        if len(self.builders) == 0:
-            # There are no builders at all.
-            return None
-
-        if len(features) == 0:
-            # They didn't ask for any features. Give them the most
-            # recently registered builder.
-            return self.builders[0]
-
-        # Go down the list of features in order, and eliminate any builders
-        # that don't match every feature.
-        features = list(features)
-        features.reverse()
-        candidates = None
-        candidate_set = None
-        while len(features) > 0:
-            feature = features.pop()
-            we_have_the_feature = self.builders_for_feature.get(feature, [])
-            if len(we_have_the_feature) > 0:
-                if candidates is None:
-                    candidates = we_have_the_feature
-                    candidate_set = set(candidates)
-                else:
-                    # Eliminate any candidates that don't have this feature.
-                    candidate_set = candidate_set.intersection(
-                        set(we_have_the_feature))
-
-        # The only valid candidates are the ones in candidate_set.
-        # Go through the original list of candidates and pick the first one
-        # that's in candidate_set.
-        if candidate_set is None:
-            return None
-        for candidate in candidates:
-            if candidate in candidate_set:
-                return candidate
-        return None
-
-# The BeautifulSoup class will take feature lists from developers and use them
-# to look up builders in this registry.
-builder_registry = TreeBuilderRegistry()
-
-
-class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
-
-    features = []
-
-    is_xml = False
-    preserve_whitespace_tags = set()
-    empty_element_tags = None # A tag will be considered an empty-element
-                              # tag when and only when it has no contents.
-
-    def __init__(self):
-        self.soup = None
-
-    def reset(self):
-        pass
-
-    def can_be_empty_element(self, tag_name):
-        """Might a tag with this name be an empty-element tag?
-
-        The final markup may or may not actually present this tag as
-        self-closing.
-
-        For instance: an HTMLBuilder does not consider a <p> tag to be
-        an empty-element tag (it's not in
-        HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p />".
-
-        The default implementation has no opinion about which tags are
-        empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
-        be left alone.
- """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - return markup, None, None - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty <head> tag, and html5lib - doesn't. Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - """ - return fragment - - def set_up_substitutions(self, tag): - pass - - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events.""" - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in attrs.items()) - #print "Start %s, %r" % (name, attrs) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print "End %s" % name - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. - """ - - preserve_whitespace_tags = set(['pre', 'textarea']) - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - # Used by set_up_substitutions to detect the charset in a META tag - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def set_up_substitutions(self, tag): - if tag.name != 'meta': - return False - - http_equiv = tag.get('http-equiv') - content = tag.get('content') - - if (http_equiv is not None - and content is not None - and http_equiv.lower() == 'content-type'): - # This is an interesting meta tag. - match = self.CHARSET_RE.search(content) - if match: - if (self.soup.declared_html_encoding is not None or - self.soup.original_encoding == self.soup.from_encoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - tag['content'] = self.CHARSET_RE.sub(rewrite, content) - return True - else: - # This is our first pass through the document. - # Go through it again with the encoding information. 
-                    new_charset = match.group(3)
-                    if (new_charset is not None
-                        and new_charset != self.soup.original_encoding):
-                        self.soup.declared_html_encoding = new_charset
-                        self.soup._feed(self.soup.declared_html_encoding)
-                        raise StopParsing
-                    pass
-        return False
-
-
-def register_treebuilders_from(module):
-    """Copy TreeBuilders from the given module into this module."""
-    # I'm fairly sure this is not the best way to do this.
-    this_module = sys.modules[__package__]
-    for name in module.__all__:
-        obj = getattr(module, name)
-
-        if issubclass(obj, TreeBuilder):
-            setattr(this_module, name, obj)
-            this_module.__all__.append(name)
-            # Register the builder while we're at it.
-            this_module.builder_registry.register(obj)
-
-# Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
-try:
-    import _lxml
-    register_treebuilders_from(_lxml)
-except ImportError:
-    # They don't have lxml installed.
-    pass
-try:
-    import _html5lib
-    register_treebuilders_from(_html5lib)
-except ImportError:
-    # They don't have html5lib installed.
-    pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
deleted file mode 100644
index f8a7a40..0000000
--- a/beautifulsoup/builder/_html5lib.py
+++ /dev/null
@@ -1,233 +0,0 @@
-__all__ = [
-    'HTML5TreeBuilder',
-    ]
-
-from beautifulsoup.builder import (
-    PERMISSIVE,
-    HTML,
-    HTML_5,
-    HTMLTreeBuilder,
-    )
-import html5lib
-from html5lib.constants import DataLossWarning
-import warnings
-from beautifulsoup.element import (
-    Comment,
-    Doctype,
-    NavigableString,
-    Tag,
-    )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
-
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
-    def prepare_markup(self, markup, user_specified_encoding):
-        # Store the user-specified encoding for use later on.
-        self.user_specified_encoding = user_specified_encoding
-        return markup, None, None
-
-    # These methods are defined by Beautiful Soup.
-    def feed(self, markup):
-        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
-        # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
-            # We need to special-case this because html5lib sets
-            # charEncoding to UTF-8 if it gets Unicode input.
-            doc.original_encoding = None
-        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
-    def create_treebuilder(self, namespaceHTMLElements):
-        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
-        return self.underlying_builder
-
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
-        if namespaceHTMLElements:
-            warnings.warn("namespaceHTMLElements not supported yet",
-                          DataLossWarning)
-        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
-    def documentClass(self):
-        self.soup.reset()
-        return Element(self.soup, self.soup, None)
-
-    def insertDoctype(self, token):
-        name = token["name"]
-        publicId = token["publicId"]
-        systemId = token["systemId"]
-
-        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
-        self.soup.object_was_parsed(doctype)
-
-    def elementClass(self, name, namespace):
-        if namespace is not None:
-            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
-        return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
-
-    def commentClass(self, data):
-        return TextNode(Comment(data), self.soup)
-
-    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
-        self.soup.name = "[document_fragment]"
-        return Element(self.soup, self.soup, None)
-
-    def appendChild(self, node):
-        self.soup.insert(len(self.soup.contents), node.element)
-
-    def testSerializer(self, element):
-        return testSerializer(element)
-
-    def getDocument(self):
-        return self.soup
-
-    def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
-    def __init__(self, element):
-        self.element = element
-        self.attrs = dict(self.element.attrs)
-    def __iter__(self):
-        return self.attrs.items().__iter__()
-    def __setitem__(self, name, value):
-        "set attr", name, value
-        self.element[name] = value
-    def items(self):
-        return self.attrs.items()
-    def keys(self):
-        return self.attrs.keys()
-    def __getitem__(self, name):
-        return self.attrs[name]
-    def __contains__(self, name):
-        return name in self.attrs.keys()
-
-
-class Element(html5lib.treebuilders._base.Node):
-    def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
-        self.element = element
-        self.soup = soup
-        self.namespace = namespace
-
-    def _nodeIndex(self, node, refNode):
-        # Finds a node by identity rather than equality
-        for index in range(len(self.element.contents)):
-            if id(self.element.contents[index]) == id(refNode.element):
-                return index
-        return None
-
-    def appendChild(self, node):
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[-1].__class__ == NavigableString):
-            # Concatenate new text onto old text node
-            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
-            newStr = NavigableString(self.element.contents[-1]+node.element)
-
-            # Remove the old text node
-            # (Can't simply use .extract() by itself, because it fails if
-            # an equal text node exists within the parent node)
-            oldElement = self.element.contents[-1]
-            del self.element.contents[-1]
-            oldElement.parent = None
-            oldElement.extract()
-
-            self.element.insert(len(self.element.contents), newStr)
-        else:
-            self.element.insert(len(self.element.contents), node.element)
-            node.parent = self
-
-    def getAttributes(self):
-        return AttrList(self.element)
-
-    def setAttributes(self, attributes):
-        if attributes is not None and attributes != {}:
-            for name, value in attributes.items():
-                self.element[name] = value
-            # The attributes may contain variables that need substitution.
-            # Call set_up_substitutions manually.
-            # The Tag constructor calls this method automatically,
-            # but html5lib creates a Tag object before setting up
-            # the attributes.
-            self.element.contains_substitutions = (
-                self.soup.builder.set_up_substitutions(
-                    self.element))
-    attributes = property(getAttributes, setAttributes)
-
-    def insertText(self, data, insertBefore=None):
-        text = TextNode(NavigableString(data), self.soup)
-        if insertBefore:
-            self.insertBefore(text, insertBefore)
-        else:
-            self.appendChild(text)
-
-    def insertBefore(self, node, refNode):
-        index = self._nodeIndex(node, refNode)
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[index-1].__class__ == NavigableString):
-            # (See comments in appendChild)
-            newStr = NavigableString(self.element.contents[index-1]+node.element)
-            oldNode = self.element.contents[index-1]
-            del self.element.contents[index-1]
-            oldNode.parent = None
-            oldNode.extract()
-
-            self.element.insert(index-1, newStr)
-        else:
-            self.element.insert(index, node.element)
-            node.parent = self
-
-    def removeChild(self, node):
-        index = self._nodeIndex(node.parent, node)
-        del node.parent.element.contents[index]
-        node.element.parent = None
-        node.element.extract()
-        node.parent = None
-
-    def reparentChildren(self, newParent):
-        while self.element.contents:
-            child = self.element.contents[0]
-            child.extract()
-            if isinstance(child, Tag):
-                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
-            else:
-                newParent.appendChild(TextNode(child, self.soup))
-
-    def cloneNode(self):
-        node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
-        for key,value in self.attributes:
-            node.attributes[key] = value
-        return node
-
-    def hasContent(self):
-        return self.element.contents
-
-    def getNameTuple(self):
-        if self.namespace == None:
-            return namespaces["html"], self.name
-        else:
-            return self.namespace, self.name
-
-    nameTuple = property(getNameTuple)
-
-class TextNode(Element):
-    def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
-        self.element = element
-        self.soup = soup
-
-    def cloneNode(self):
-        raise NotImplementedError
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
deleted file mode 100644
index 23ac485..0000000
--- a/beautifulsoup/builder/_lxml.py
+++ /dev/null
@@ -1,108 +0,0 @@
-__all__ = [
-    'LXMLTreeBuilderForXML',
-    'LXMLTreeBuilder',
-    ]
-
-from lxml import etree
-from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import (
-    FAST,
-    HTML,
-    HTMLTreeBuilder,
-    PERMISSIVE,
-    TreeBuilder,
-    XML)
-from beautifulsoup.dammit import UnicodeDammit
-import types
-
-LXML = 'lxml'
-
-class LXMLTreeBuilderForXML(TreeBuilder):
-    DEFAULT_PARSER_CLASS = etree.XMLParser
-
-    is_xml = True
-
-    # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
-
-    @property
-    def default_parser(self):
-        # This can either return a parser object or a class, which
-        # will be instantiated with default arguments.
-        return etree.XMLParser(target=self, strip_cdata=False, recover=True)
-
-    def __init__(self, parser=None, empty_element_tags=None):
-        if empty_element_tags is not None:
-            self.empty_element_tags = set(empty_element_tags)
-        if parser is None:
-            # Use the default parser.
-            parser = self.default_parser
-        if callable(parser):
-            # Instantiate the parser with default arguments
-            parser = parser(target=self, strip_cdata=False)
-        self.parser = parser
-        self.soup = None
-
-    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        """
-        :return: A 3-tuple (markup, original encoding, encoding
-        declared within markup).
-        """
-        if isinstance(markup, unicode):
-            return markup, None, None
-
-        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
-        return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding)
-
-    def feed(self, markup):
-        self.parser.feed(markup)
-        self.parser.close()
-
-    def close(self):
-        pass
-
-    def start(self, name, attrs):
-        self.soup.handle_starttag(name, attrs)
-
-    def end(self, name):
-        self.soup.endData()
-        completed_tag = self.soup.tagStack[-1]
-        self.soup.handle_endtag(name)
-
-    def pi(self, target, data):
-        pass
-
-    def data(self, content):
-        self.soup.handle_data(content)
-
-    def doctype(self, name, pubid, system):
-        self.soup.endData()
-        doctype = Doctype.for_name_and_ids(name, pubid, system)
-        self.soup.object_was_parsed(doctype)
-
-    def comment(self, content):
-        "Handle comments as Comment objects."
-        self.soup.endData()
-        self.soup.handle_data(content)
-        self.soup.endData(Comment)
-
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment
-
-
-class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
-
-    features = [LXML, HTML, FAST]
-    is_xml = False
-
-    @property
-    def default_parser(self):
-        return etree.HTMLParser
-
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<html><body>%s</body></html>' % fragment
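
For reference, the following minimal sketch shows how TreeBuilderRegistry.lookup() (deleted from __init__.py above) resolved a builder from a list of features. It assumes the pre-removal beautifulsoup.builder package is still importable; the two Fake* builder classes are hypothetical stand-ins used only for illustration.

    from beautifulsoup.builder import (
        FAST, HTML, HTML_5, PERMISSIVE, XML,
        HTMLTreeBuilder, TreeBuilderRegistry)

    # Two toy builders; only their advertised feature lists matter here.
    class FakeLXMLBuilder(HTMLTreeBuilder):
        features = ['lxml', HTML, FAST]

    class FakeHTML5Builder(HTMLTreeBuilder):
        features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    registry = TreeBuilderRegistry()
    registry.register(FakeLXMLBuilder)   # registered first: lower priority
    registry.register(FakeHTML5Builder)  # registered last: wins ties

    # No features requested: the most recently registered builder is returned.
    assert registry.lookup() is FakeHTML5Builder
    # Only the toy lxml builder advertises FAST.
    assert registry.lookup(FAST) is FakeLXMLBuilder
    # Both advertise HTML, but only the html5lib one is also PERMISSIVE.
    assert registry.lookup(HTML, PERMISSIVE) is FakeHTML5Builder
    # No registered builder advertises XML, so the lookup returns None.
    assert registry.lookup(XML) is None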
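
Similarly, the meta-charset rewriting that HTMLTreeBuilder.set_up_substitutions performed (also deleted from __init__.py above) can be summarized with a small sketch of its CHARSET_RE at work; the content string below is made up for illustration.

    import re

    # The regex removed above: group 1 ends with "charset=", group 3 is the charset value.
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    content = "text/html; charset=ISO-8859-1"   # hypothetical <meta> content attribute
    match = CHARSET_RE.search(content)
    assert match.group(3) == "ISO-8859-1"

    # On a later pass the charset value is replaced with a placeholder token,
    # so the document can be re-serialized in whatever encoding is chosen.
    rewritten = CHARSET_RE.sub(lambda m: m.group(1) + "%SOUP-ENCODING%", content)
    assert rewritten == "text/html; charset=%SOUP-ENCODING%"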