# Use of this source code is governed by the MIT license. __license__ = "MIT" try: from collections.abc import Callable # Python 3.6 except ImportError , e: from collections import Callable import re import sys import warnings try: import soupsieve except ImportError, e: soupsieve = None warnings.warn( 'The soupsieve package is not installed. CSS selectors cannot be used.' ) from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) nonwhitespace_re = re.compile(r"\S+") # NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on # the off chance someone imported it for their own use. whitespace_re = re.compile(r"\s+") def _alias(attr): """Alias one attribute name to another for backward compatibility""" @property def alias(self): return getattr(self, attr) @alias.setter def alias(self): return setattr(self, attr) return alias class NamespacedAttribute(unicode): def __new__(cls, prefix, name, namespace=None): if name is None: obj = unicode.__new__(cls, prefix) elif prefix is None: # Not really namespaced. obj = unicode.__new__(cls, name) else: obj = unicode.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj class AttributeValueWithCharsetSubstitution(unicode): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'charset' attribute. When Beautiful Soup parses the markup '', the value of the 'charset' attribute will be one of these objects. """ def __new__(cls, original_value): obj = unicode.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): return encoding class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): """A generic stand-in for the value of a meta tag's 'content' attribute. When Beautiful Soup parses the markup: The value of the 'content' attribute will be one of these objects. """ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) def __new__(cls, original_value): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. return unicode.__new__(unicode, original_value) obj = unicode.__new__(cls, original_value) obj.original_value = original_value return obj def encode(self, encoding): def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) class Formatter(EntitySubstitution): """Describes a strategy to use when outputting a parse tree to a string. Some parts of this strategy come from the distinction between HTML4, HTML5, and XML. Others are configurable by the user. """ # Registries of XML and HTML formatters. XML_FORMATTERS = {} HTML_FORMATTERS = {} HTML = 'html' XML = 'xml' HTML_DEFAULTS = dict( cdata_containing_tags=set(["script", "style"]), preformatted_tags=set(["pre"]), ) def _default(self, language, value, kwarg): if value is not None: return value if language == self.XML: return set() return self.HTML_DEFAULTS[kwarg] def __init__( self, language=None, entity_substitution=None, void_element_close_prefix='/', cdata_containing_tags=None, preformatted_tags=None, ): """ :param void_element_close_prefix: By default, represent void elements as rather than """ self.language = language self.entity_substitution = entity_substitution self.void_element_close_prefix = void_element_close_prefix self.cdata_containing_tags = self._default( language, cdata_containing_tags, 'cdata_containing_tags' ) def substitute(self, ns): """Process a string that needs to undergo entity substitution.""" if not self.entity_substitution: return ns if (isinstance(ns, NavigableString) and ns.parent is not None and ns.parent.name in self.cdata_containing_tags): # Do nothing. return ns # Substitute. return self.entity_substitution(ns) def attribute_value(self, value): """Process the value of an attribute.""" return self.substitute(value) def attributes(self, tag): """Reorder a tag's attributes however you want.""" return sorted(tag.attrs.items()) class HTMLFormatter(Formatter): REGISTRY = {} def __init__(self, *args, **kwargs): return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) class XMLFormatter(Formatter): REGISTRY = {} def __init__(self, *args, **kwargs): return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) # Set up aliases for the default formatters. HTMLFormatter.REGISTRY['html'] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html ) HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html, void_element_close_prefix = None ) HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_xml ) HTMLFormatter.REGISTRY[None] = HTMLFormatter( entity_substitution=None ) XMLFormatter.REGISTRY["html"] = XMLFormatter( entity_substitution=EntitySubstitution.substitute_html ) XMLFormatter.REGISTRY["minimal"] = XMLFormatter( entity_substitution=EntitySubstitution.substitute_xml ) XMLFormatter.REGISTRY[None] = Formatter( Formatter(Formatter.XML, entity_substitution=None) ) class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def format_string(self, s, formatter): """Format the given string using the given formatter.""" if formatter is None: return s if not isinstance(formatter, Formatter): formatter = self.formatter_by_name(formatter) output = formatter.substitute(s) return output @property def _is_xml(self): """Is this element part of an XML tree or an HTML tree? This is used when mapping a formatter name ("minimal") to an appropriate function (one that performs entity-substitution on the contents of