summaryrefslogtreecommitdiff
path: root/bs4/element.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/element.py')
-rw-r--r--bs4/element.py88
1 files changed, 83 insertions, 5 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 594ef78..bc6d9c8 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -81,6 +81,38 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+ """Entity substitution rules that are aware of some HTML quirks.
+
+ Specifically, the contents of <script> and <style> tags should not
+ undergo entity substitution.
+
+ Incoming NavigableString objects are checked to see if they're the
+ direct children of a <script> or <style> tag.
+ """
+
+ cdata_containing_tags = set(["script", "style"])
+
+ @classmethod
+ def _substitute_if_appropriate(cls, ns, f):
+ if (isinstance(ns, NavigableString)
+ and ns.parent is not None
+ and ns.parent.name in cls.cdata_containing_tags):
+ # Do nothing.
+ return ns
+ # Substitute.
+ return f(ns)
+
+ @classmethod
+ def substitute_html(cls, ns):
+ return cls._substitute_if_appropriate(
+ ns, EntitySubstitution.substitute_html)
+
+ @classmethod
+ def substitute_xml(cls, ns):
+ return cls._substitute_if_appropriate(
+ ns, EntitySubstitution.substitute_xml)
class PageElement(object):
"""Contains the navigational information for some part of the page
@@ -97,25 +129,60 @@ class PageElement(object):
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A function - This function will be called on every string that
- # needs to undergo entity substition
- FORMATTERS = {
+ # needs to undergo entity substitution.
+ #
+
+ # In an HTML document, the default "html" and "minimal" functions
+ # will leave the contents of <script> and <style> tags alone. For
+ # an XML document, all tags will be given the same treatment.
+
+ HTML_FORMATTERS = {
+ "html" : HTMLAwareEntitySubstitution.substitute_html,
+ "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
+ None : None
+ }
+
+ XML_FORMATTERS = {
"html" : EntitySubstitution.substitute_html,
"minimal" : EntitySubstitution.substitute_xml,
None : None
}
- @classmethod
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
if not callable(formatter):
- formatter = self.FORMATTERS.get(
- formatter, EntitySubstitution.substitute_xml)
+ formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
output = formatter(s)
return output
+ @property
+ def _is_xml(self):
+ """Is this element part of an XML tree or an HTML tree?
+
+ This is used when mapping a formatter name ("minimal") to an
+ appropriate function (one that performs entity-substitution on
+ the contents of <script> and <style> tags, or not). It's
+ inefficient, but it should be called very rarely.
+ """
+ if self.parent is None:
+ # This is the top-level object. It should have .is_xml set
+ # from tree creation. If not, take a guess--BS is usually
+ # used on HTML markup.
+ return getattr(self, 'is_xml', False)
+ return self.parent._is_xml
+
+ def _formatter_for_name(self, name):
+ "Look up a formatter function based on its name and the tree."
+ if self._is_xml:
+ return self.XML_FORMATTERS.get(
+ name, EntitySubstitution.substitute_xml)
+ else:
+ return self.HTML_FORMATTERS.get(
+ name, HTMLAwareEntitySubstitution.substitute_xml)
+
def setup(self, parent=None, previous_element=None):
"""Sets up the initial relations between this element and
other elements."""
@@ -981,6 +1048,12 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
+
+ # First off, turn a string formatter into a function. This
+ # will stop the lookup from happening over and over again.
+ if not callable(formatter):
+ formatter = self._formatter_for_name(formatter)
+
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
@@ -1066,6 +1139,11 @@ class Tag(PageElement):
document contains a <META> tag that mentions the document's
encoding.
"""
+ # First off, turn a string formatter into a function. This
+ # will stop the lookup from happening over and over again.
+ if not callable(formatter):
+ formatter = self._formatter_for_name(formatter)
+
pretty_print = (indent_level is not None)
s = []
for c in self: