summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bs4/element.py41
1 files changed, 30 insertions, 11 deletions
diff --git a/bs4/element.py b/bs4/element.py
index 86126ec..99c9b39 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -175,6 +175,8 @@ class PageElement(object):
the contents of <script> and <style> tags, or not). It's
inefficient, but it should be called very rarely.
"""
+ if self.__is_xml is not None:
+ return self.__is_xml
if self.parent is None:
# This is the top-level object. It should have .is_xml set
# from tree creation. If not, take a guess--BS is usually
@@ -680,6 +682,14 @@ class NavigableString(unicode, PageElement):
PREFIX = ''
SUFFIX = ''
+ # We can't tell just by looking at a string whether it's contained
+ # in an XML document or an HTML document.
+ __is_xml = None
+
+ @property
+ def _is_xml(self):
+ return None
+
def __new__(cls, value):
"""Create a new NavigableString.
@@ -790,7 +800,8 @@ class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
def __init__(self, parser=None, builder=None, name=None, namespace=None,
- prefix=None, attrs=None, parent=None, previous=None):
+ prefix=None, attrs=None, parent=None, previous=None,
+ is_xml=None):
"Basic constructor."
if parser is None:
@@ -804,6 +815,14 @@ class Tag(PageElement):
self.name = name
self.namespace = namespace
self.prefix = prefix
+ if builder is not None:
+ preserve_whitespace_tags = builder.preserve_whitespace_tags
+ else:
+ if is_xml:
+ preserve_whitespace_tags = []
+ else:
+ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+ self.preserve_whitespace_tags = preserve_whitespace_tags
if attrs is None:
attrs = {}
elif attrs:
@@ -814,6 +833,13 @@ class Tag(PageElement):
attrs = dict(attrs)
else:
attrs = dict(attrs)
+
+ # If possible, determine ahead of time whether this tag is an
+ # XML tag.
+ if builder:
+ self.__is_xml = builder.is_xml
+ else:
+ self.__is_xml = is_xml
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
@@ -833,7 +859,7 @@ class Tag(PageElement):
Its contents are a copy of the old Tag's contents.
"""
clone = type(self)(None, self.builder, self.name, self.namespace,
- self.nsprefix, self.attrs)
+ self.nsprefix, self.attrs, is_xml=self._is_xml)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
@@ -1006,7 +1032,7 @@ class Tag(PageElement):
tag_name, tag_name))
return self.find(tag_name)
# We special case contents to avoid recursion.
- elif not tag.startswith("__") and not tag=="contents":
+ elif not tag.startswith("__") and not tag == "contents":
return self.find(tag)
raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, tag))
@@ -1066,17 +1092,10 @@ class Tag(PageElement):
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
- if self.builder:
- preserve_whitespace_tags = self.builder.preserve_whitespace_tags
- else:
- if self._is_xml:
- preserve_whitespace_tags = []
- else:
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
return (
indent_level is not None
- and self.name not in preserve_whitespace_tags
+ and self.name not in self.preserve_whitespace_tags
)
def decode(self, indent_level=None,