diff options
-rw-r--r-- | bs4/element.py | 41 |
1 files changed, 30 insertions, 11 deletions
diff --git a/bs4/element.py b/bs4/element.py index 86126ec..99c9b39 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -175,6 +175,8 @@ class PageElement(object): the contents of <script> and <style> tags, or not). It's inefficient, but it should be called very rarely. """ + if self.__is_xml is not None: + return self.__is_xml if self.parent is None: # This is the top-level object. It should have .is_xml set # from tree creation. If not, take a guess--BS is usually @@ -680,6 +682,14 @@ class NavigableString(unicode, PageElement): PREFIX = '' SUFFIX = '' + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + __is_xml = None + + @property + def _is_xml(self): + return None + def __new__(cls, value): """Create a new NavigableString. @@ -790,7 +800,8 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None): + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None): "Basic constructor." if parser is None: @@ -804,6 +815,14 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + if builder is not None: + preserve_whitespace_tags = builder.preserve_whitespace_tags + else: + if is_xml: + preserve_whitespace_tags = [] + else: + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags + self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -814,6 +833,13 @@ class Tag(PageElement): attrs = dict(attrs) else: attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.__is_xml = builder.is_xml + else: + self.__is_xml = is_xml self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -833,7 +859,7 @@ class Tag(PageElement): Its contents are a copy of the old Tag's contents. """ clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs) + self.nsprefix, self.attrs, is_xml=self._is_xml) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -1006,7 +1032,7 @@ class Tag(PageElement): tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. - elif not tag.startswith("__") and not tag=="contents": + elif not tag.startswith("__") and not tag == "contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) @@ -1066,17 +1092,10 @@ class Tag(PageElement): def _should_pretty_print(self, indent_level): """Should this tag be pretty-printed?""" - if self.builder: - preserve_whitespace_tags = self.builder.preserve_whitespace_tags - else: - if self._is_xml: - preserve_whitespace_tags = [] - else: - preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags return ( indent_level is not None - and self.name not in preserve_whitespace_tags + and self.name not in self.preserve_whitespace_tags ) def decode(self, indent_level=None, |