diff options
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 49 |
1 files changed, 36 insertions, 13 deletions
diff --git a/bs4/element.py b/bs4/element.py index 7c72894..b176777 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -29,6 +29,24 @@ class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" + # There are five possible values for the "formatter" argument passed in + # to methods like encode() and prettify(): + # + # "html" - All Unicode characters with corresponding HTML entities + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to + # XML entities: &amp; &lt; &gt; + # None - The null formatter. Unicode characters are never + # converted to entities. This is not recommended, but it's + # faster than "minimal". + # A function - This function will be called on every string that + # needs to undergo entity substitution + FORMATTERS = { + "html" : EntitySubstitution.substitute_html, + "minimal" : EntitySubstitution.substitute_xml, + None : None + } + def setup(self, parent=None, previous_element=None): """Sets up the initial relations between this element and other elements.""" @@ -396,11 +414,15 @@ class NavigableString(unicode, PageElement): "'%s' object has no attribute '%s'" % ( self.__class__.__name__, attr)) - def output_ready(self, substitute_html_entities=False): - if substitute_html_entities: - output = EntitySubstitution.substitute_html(self) + def output_ready(self, formatter="minimal"): + if not callable(formatter): + formatter = self.FORMATTERS.get( + formatter, EntitySubstitution.substitute_xml) + if formatter is None: + output = self else: - output = EntitySubstitution.substitute_xml(self) + output = formatter(self) + return self.PREFIX + output + self.SUFFIX @@ -673,13 +695,13 @@ class Tag(PageElement): __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, substitute_html_entities=False): + indent_level=None, formatter="minimal"): return self.decode(indent_level, encoding, - 
substitute_html_entities).encode(encoding) + formatter).encode(encoding) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): + formatter="minimal"): """Returns a Unicode representation of this tag and its contents. :param eventual_encoding: The tag is destined to be @@ -720,7 +742,7 @@ class Tag(PageElement): space = '' indent_contents = None contents = self.decode_contents( - indent_contents, eventual_encoding, substitute_html_entities) + indent_contents, eventual_encoding, formatter) if self.hidden: # This is the 'document root' object. @@ -746,12 +768,13 @@ class Tag(PageElement): s = ''.join(s) return s - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.encode(encoding, True) + def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + return self.encode(encoding, True, formatter) def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): + formatter="minimal"): """Renders the contents of this tag as a Unicode string. :param eventual_encoding: The tag is destined to be @@ -766,10 +789,10 @@ class Tag(PageElement): for c in self: text = None if isinstance(c, NavigableString): - text = c.output_ready(substitute_html_entities) + text = c.output_ready(formatter) elif isinstance(c, Tag): s.append(c.decode(indent_level, eventual_encoding, - substitute_html_entities)) + formatter)) if text and indent_level: text = text.strip() if text: |