diff options
-rw-r--r-- | bs4/builder/__init__.py | 18 | ||||
-rw-r--r-- | bs4/element.py | 296 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 18 |
3 files changed, 158 insertions, 174 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index c5e6e84..e087f07 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -7,7 +7,6 @@ import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, - HTMLAwareEntitySubstitution, nonwhitespace_re ) @@ -90,7 +89,6 @@ class TreeBuilder(object): is_xml = False picklable = False - preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -98,9 +96,11 @@ class TreeBuilder(object): # comma-separated list of CDATA, rather than a single CDATA. DEFAULT_CDATA_LIST_ATTRIBUTES = {} + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + USE_DEFAULT = object() - def __init__(self, multi_valued_attributes=USE_DEFAULT): + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): """Constructor. :param multi_valued_attributes: If this is set to None, the @@ -110,14 +110,19 @@ class TreeBuilder(object): for an example. Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-use, so the argument ame + probably doesn't make sense to an end-user, so the argument name is `multi_valued_attributes`. + + :param preserve_whitespace_tags: """ self.soup = None if multi_valued_attributes is self.USE_DEFAULT: multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES self.cdata_list_attributes = multi_valued_attributes - + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now being associated with the TreeBuilder. @@ -253,7 +258,6 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set([ # These are from HTML5. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', @@ -292,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder): "output" : ["for"], } + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + def set_up_substitutions(self, tag): # We are only interested in <meta> tags if tag.name != 'meta': diff --git a/bs4/element.py b/bs4/element.py index e8e48df..a233dcd 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -99,134 +99,114 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) -class HTMLAwareEntitySubstitution(EntitySubstitution): - """Entity substitution rules that are aware of some HTML quirks. +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. - Specifically, the contents of <script> and <style> tags should not - undergo entity substitution. - - Incoming NavigableString objects are checked to see if they're the - direct children of a <script> or <style> tag. + Some parts of this strategy come from the distinction between + HTML4, HTML5, and XML. Others are configurable by the user. """ + # Registries of XML and HTML formatters. + XML_FORMATTERS = {} + HTML_FORMATTERS = {} - cdata_containing_tags = set(["script", "style"]) + HTML = 'html' + XML = 'xml' - preformatted_tags = set(["pre"]) + HTML_DEFAULTS = dict( + cdata_containing_tags=set(["script", "style"]), + preformatted_tags=set(["pre"]), + ) - preserve_whitespace_tags = set(['pre', 'textarea']) + def _default(self, language, value, kwarg): + if value is not None: + return value + if language == self.XML: + return set() + return self.HTML_DEFAULTS[kwarg] + + def __init__( + self, language=None, entity_substitution=None, + void_element_close_prefix='/', cdata_containing_tags=None, + preformatted_tags=None, + ): + """ - @classmethod - def _substitute_if_appropriate(cls, ns, f): + :param void_element_close_prefix: By default, represent void + elements as <tag/> rather than <tag> + """ + self.language = language + self.entity_substitution = entity_substitution + self.void_element_close_prefix = void_element_close_prefix + self.cdata_containing_tags = self._default( + language, cdata_containing_tags, 'cdata_containing_tags' + ) + + def substitute(self, ns): + """Process a string that needs to undergo entity substitution.""" + if not self.entity_substitution: + return ns if (isinstance(ns, NavigableString) and ns.parent is not None - and ns.parent.name in cls.cdata_containing_tags): + and ns.parent.name in self.cdata_containing_tags): # Do nothing. return ns # Substitute. - return f(ns) - - @classmethod - def substitute_html(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_html) + return self.entity_substitution(ns) - @classmethod - def substitute_xml(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_xml) - -class Formatter(object): - """Contains information about how to format a parse tree.""" + def attribute_value(self, value): + """Process the value of an attribute.""" + return self.substitute(value) - # By default, represent void elements as <tag/> rather than <tag> - void_element_close_prefix = '/' - - def substitute(self, *args, **kwargs): - """Transform certain characters into named entities.""" - raise NotImplementedError() - - def sort_attributes(self, attributes): + def attributes(self, tag): """Reorder a tag's attributes however you want.""" - return sorted(attributes.items()) - - + return sorted(tag.attrs.items()) + class HTMLFormatter(Formatter): - """The default HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - -class MinimalHTMLFormatter(Formatter): - """A minimal HTML formatter.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs) + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) -class HTML5Formatter(HTMLFormatter): - """An HTML formatter that omits the slash in a void tag.""" - void_element_close_prefix = None - class XMLFormatter(Formatter): - """Substitute only the essential XML entities.""" - def substitute(self, *args, **kwargs): - return EntitySubstitution.substitute_xml(*args, **kwargs) - -class HTMLXMLFormatter(Formatter): - """Format XML using HTML rules.""" - def substitute(self, *args, **kwargs): - return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs) - + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) + +# Set up aliases for the default formatters. +HTMLFormatter.REGISTRY['html'] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html, + void_element_close_prefix = None +) +HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +HTMLFormatter.REGISTRY[None] = HTMLFormatter( + entity_substitution=None +) +XMLFormatter.REGISTRY["html"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +XMLFormatter.REGISTRY["minimal"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +XMLFormatter.REGISTRY[None] = Formatter( + Formatter(Formatter.XML, entity_substitution=None) +) class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" - - # There are five possible values for the "formatter" argument passed in - # to methods like encode() and prettify(): - # - # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "html5" - The same as "html", but empty void tags are represented as - # <tag> rather than <tag/> - # "minimal" - Bare ampersands and angle brackets are converted to - # XML entities: & < > - # None - The null formatter. Unicode characters are never - # converted to entities. This is not recommended, but it's - # faster than "minimal". - # A callable function - it will be called on every string that needs to undergo entity substitution. - # A Formatter instance - Formatter.substitute(string) will be called on every string that - # needs to undergo entity substitution. - # - - # In an HTML document, the default "html", "html5", and "minimal" - # functions will leave the contents of <script> and <style> tags - # alone. For an XML document, all tags will be given the same - # treatment. - - HTML_FORMATTERS = { - "html" : HTMLFormatter(), - "html5" : HTML5Formatter(), - "minimal" : MinimalHTMLFormatter(), - None : None - } - - XML_FORMATTERS = { - "html" : HTMLXMLFormatter(), - "minimal" : XMLFormatter(), - None : None - } - - def format_string(self, s, formatter='minimal'): + + def format_string(self, s, formatter): """Format the given string using the given formatter.""" - if isinstance(formatter, basestring): - formatter = self._formatter_for_name(formatter) if formatter is None: - output = s - else: - if isinstance(formatter, Callable): - # Backwards compatibility -- you used to pass in a formatting method. - output = formatter(s) - else: - output = formatter.substitute(s) + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_by_name(formatter) + output = formatter.substitute(s) return output @property @@ -253,13 +233,6 @@ class PageElement(object): return getattr(self, 'is_xml', False) return self.parent._is_xml - def _formatter_for_name(self, name): - "Look up a formatter function based on its name and the tree." - if self._is_xml: - return self.XML_FORMATTERS.get(name, XMLFormatter()) - else: - return self.HTML_FORMATTERS.get(name, HTMLFormatter()) - def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and @@ -765,10 +738,12 @@ class PreformattedString(NavigableString): but the return value will be ignored. """ - def output_ready(self, formatter="minimal"): - """CData strings are passed into the formatter. - But the return value is ignored.""" - self.format_string(self, formatter) + def output_ready(self, formatter=None): + """CData strings are passed into the formatter, purely + for any side effects. The return value is ignored. + """ + if formatter: + self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): @@ -836,14 +811,6 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix - if builder is not None: - preserve_whitespace_tags = builder.preserve_whitespace_tags - else: - if is_xml: - preserve_whitespace_tags = [] - else: - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -887,6 +854,10 @@ class Tag(PageElement): # (unlike can_be_empty_element), we almost never need # to check this. self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags parserClass = _alias("parser_class") # BS3 @@ -1135,14 +1106,6 @@ class Tag(PageElement): u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" - - return ( - indent_level is not None - and self.name not in self.preserve_whitespace_tags - ) - def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): @@ -1158,32 +1121,29 @@ class Tag(PageElement): # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): - formatter = self._formatter_for_name(formatter) + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) attrs = [] - if self.attrs: - if isinstance(formatter, Formatter): - sorted_attrs = formatter.sort_attributes(self.attrs) + for key, val in attributes: + if val is None: + decoded = key else: - sorted_attrs = self.attrs.items() - for key, val in sorted_attrs: - if val is None: - decoded = key - else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) - elif ( + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = unicode(val) + elif ( isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None): - val = val.encode(eventual_encoding) - - text = self.format_string(val, formatter) - decoded = ( - unicode(key) + '=' - + EntitySubstitution.quoted_attribute_value(text)) - attrs.append(decoded) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + unicode(key) + '=' + + EntitySubstitution.quoted_attribute_value(text)) + attrs.append(decoded) close = '' closeTag = '' @@ -1192,9 +1152,7 @@ class Tag(PageElement): prefix = self.prefix + ":" if self.is_empty_element: - close = '' - if isinstance(formatter, Formatter): - close = formatter.void_element_close_prefix or close + close = formatter.void_element_close_prefix or '' else: closeTag = '</%s%s>' % (prefix, self.name) @@ -1241,6 +1199,13 @@ class Tag(PageElement): s = ''.join(s) return s + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) + def prettify(self, encoding=None, formatter="minimal"): if encoding is None: return self.decode(True, formatter=formatter) @@ -1267,8 +1232,8 @@ class Tag(PageElement): """ # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): - formatter = self._formatter_for_name(formatter) + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) pretty_print = (indent_level is not None) s = [] @@ -1289,6 +1254,19 @@ class Tag(PageElement): s.append("\n") return ''.join(s) + def formatter_for_name(self, formatter): + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if callable(formatter): + formatter = c(entity_substitution=formatter) + formatter.custom = True + return formatter + return c.REGISTRY[formatter] + def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index f7c5e2f..ffbc29e 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -24,8 +24,8 @@ from bs4.element import ( CData, Comment, Declaration, - MinimalHTMLFormatter, Doctype, + HTMLFormatter, NavigableString, SoupStrainer, Tag, @@ -1533,7 +1533,7 @@ class TestSubstitutions(SoupTest): # callable is called on every string. self.assertEqual( decoded, - self.document_for(u"<b><FOO></b><b>BAR</b><br>")) + self.document_for(u"<b><FOO></b><b>BAR</b><br/>")) def test_formatter_is_run_on_attribute_values(self): markup = u'<a href="http://a.com?a=b&c=é">e</a>' @@ -1687,10 +1687,10 @@ class TestEncoding(SoupTest): class TestFormatter(SoupTest): def test_sort_attributes(self): - class UnsortedFormatter(MinimalHTMLFormatter): - def sort_attributes(self, attributes): - self.called_with = attributes - for k, v in sorted(attributes.items()): + class UnsortedFormatter(HTMLFormatter): + def attributes(self, tag): + self.called_with = tag + for k, v in sorted(tag.attrs.items()): if k == 'ignore': continue yield k,v @@ -1699,9 +1699,9 @@ class TestFormatter(SoupTest): formatter = UnsortedFormatter() decoded = soup.decode(formatter=formatter) - # sort_attributes() was called with all three attributes. It removed one and - # sorted the other two. - self.assertEquals(formatter.called_with, dict(cval="1", aval="2", ignore="ignored")) + # sort_attributes() was called on the <p> tag. It filtered out one + # attribute and sorted the other two. + self.assertEquals(formatter.called_with, soup.p) self.assertEquals(u'<p aval="2" cval="1"></p>', decoded) |