diff options
Diffstat (limited to 'bs4/element.py')
-rw-r--r-- | bs4/element.py | 102 |
1 files changed, 66 insertions, 36 deletions
diff --git a/bs4/element.py b/bs4/element.py index ecf2b28..b100d18 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,8 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __license__ = "MIT" -from pdb import set_trace import collections import re +import shlex import sys import warnings from bs4.dammit import EntitySubstitution @@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): preformatted_tags = set(["pre"]) + preserve_whitespace_tags = set(['pre', 'textarea']) + @classmethod def _substitute_if_appropriate(cls, ns, f): if (isinstance(ns, NavigableString) @@ -169,11 +173,19 @@ class PageElement(object): This is used when mapping a formatter name ("minimal") to an appropriate function (one that performs entity-substitution on - the contents of <script> and <style> tags, or not). It's + the contents of <script> and <style> tags, or not). It can be inefficient, but it should be called very rarely. """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. if self.parent is None: - # This is the top-level object. It should have .is_xml set + # This is the top-level object. It should have .known_xml set # from tree creation. If not, take a guess--BS is usually # used on HTML markup. return getattr(self, 'is_xml', False) @@ -637,7 +649,7 @@ class PageElement(object): return lambda el: el._attr_value_as_string( attribute, '').startswith(value) elif operator == '$': - # string represenation of `attribute` ends with `value` + # string representation of `attribute` ends with `value` return lambda el: el._attr_value_as_string( attribute, '').endswith(value) elif operator == '*': @@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement): PREFIX = '' SUFFIX = '' + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + + known_xml = None + def __new__(cls, value): """Create a new NavigableString. @@ -743,10 +760,16 @@ class CData(PreformattedString): SUFFIX = u']]>' class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" PREFIX = u'<?' SUFFIX = u'>' +class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" + PREFIX = u'<?' + SUFFIX = u'?>' + class Comment(PreformattedString): PREFIX = u'<!--' @@ -781,7 +804,8 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None): + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None): "Basic constructor." if parser is None: @@ -795,6 +819,14 @@ class Tag(PageElement): self.name = name self.namespace = namespace self.prefix = prefix + if builder is not None: + preserve_whitespace_tags = builder.preserve_whitespace_tags + else: + if is_xml: + preserve_whitespace_tags = [] + else: + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags + self.preserve_whitespace_tags = preserve_whitespace_tags if attrs is None: attrs = {} elif attrs: @@ -805,6 +837,13 @@ class Tag(PageElement): attrs = dict(attrs) else: attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -824,7 +863,7 @@ class Tag(PageElement): Its contents are a copy of the old Tag's contents. """ clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs) + self.nsprefix, self.attrs, is_xml=self._is_xml) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -997,7 +1036,7 @@ class Tag(PageElement): tag_name, tag_name)) return self.find(tag_name) # We special case contents to avoid recursion. - elif not tag.startswith("__") and not tag=="contents": + elif not tag.startswith("__") and not tag == "contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) @@ -1057,10 +1096,11 @@ class Tag(PageElement): def _should_pretty_print(self, indent_level): """Should this tag be pretty-printed?""" + return ( - indent_level is not None and - (self.name not in HTMLAwareEntitySubstitution.preformatted_tags - or self._is_xml)) + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, @@ -1280,6 +1320,7 @@ class Tag(PageElement): _selector_combinators = ['>', '+', '~'] _select_debug = False + quoted_colon = re.compile('"[^"]*:[^"]*"') def select_one(self, selector): """Perform a CSS selection operation on the current element.""" value = self.select(selector, limit=1) @@ -1305,8 +1346,7 @@ class Tag(PageElement): if limit and len(context) >= limit: break return context - - tokens = selector.split() + tokens = shlex.split(selector) current_context = [self] if tokens[-1] in self._selector_combinators: @@ -1358,7 +1398,7 @@ class Tag(PageElement): return classes.issubset(candidate.get('class', [])) checker = classes_match - elif ':' in token: + elif ':' in token and not self.quoted_colon.search(token): # Pseudo-class tag_name, pseudo = token.split(':', 1) if tag_name == '': @@ -1389,11 +1429,8 @@ class Tag(PageElement): self.count += 1 if self.count == self.destination: return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False + else: + return False checker = Counter(pseudo_value).nth_child_of_type else: raise NotImplementedError( @@ -1498,13 +1535,12 @@ class Tag(PageElement): # don't include it in the context more than once. new_context.append(candidate) new_context_ids.add(id(candidate)) - if limit and len(new_context) >= limit: - break elif self._select_debug: print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - current_context = new_context + if limit and len(current_context) >= limit: + current_context = current_context[:limit] if self._select_debug: print "Final verdict:" @@ -1668,21 +1704,15 @@ class SoupStrainer(object): if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. - if (isinstance(match_against, unicode) - and ' ' in match_against): - # A bit of a special case. If they try to match "foo - # bar" on a multivalue attribute's value, only accept - # the literal value "foo bar" - # - # XXX This is going to be pretty slow because we keep - # splitting match_against. But it shouldn't come up - # too often. - return (whitespace_re.split(match_against) == markup) - else: - for item in markup: - if self._matches(item, match_against): - return True - return False + for item in markup: + if self._matches(item, match_against): + return True + # We didn't match any particular value of the multivalue + # attribute, but maybe we match the attribute value when + # considered as a string. + if self._matches(' '.join(markup), match_against): + return True + return False if match_against is True: # True matches any non-None value. |