1 files changed, 66 insertions, 36 deletions
diff --git a/bs4/element.py b/bs4/element.py
index ecf2b28..b100d18 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1,8 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __license__ = "MIT"
 
-from pdb import set_trace
 import collections
 import re
+import shlex
 import sys
 import warnings
 from bs4.dammit import EntitySubstitution
@@ -99,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution):
 
     preformatted_tags = set(["pre"])
 
+    preserve_whitespace_tags = set(['pre', 'textarea'])
+
     @classmethod
     def _substitute_if_appropriate(cls, ns, f):
         if (isinstance(ns, NavigableString)
@@ -169,11 +173,19 @@ class PageElement(object):
 
         This is used when mapping a formatter name ("minimal") to an
         appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It's
+        the contents of <script> and <style> tags, or not). It can be
         inefficient, but it should be called very rarely.
         """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
         if self.parent is None:
-            # This is the top-level object. It should have .is_xml set
+            # This is the top-level object. It should have .known_xml set
             # from tree creation. If not, take a guess--BS is usually
             # used on HTML markup.
             return getattr(self, 'is_xml', False)
@@ -637,7 +649,7 @@ class PageElement(object):
             return lambda el: el._attr_value_as_string(
                 attribute, '').startswith(value)
         elif operator == '$':
-            # string represenation of `attribute` ends with `value`
+            # string representation of `attribute` ends with `value`
             return lambda el: el._attr_value_as_string(
                 attribute, '').endswith(value)
         elif operator == '*':
@@ -677,6 +689,11 @@ class NavigableString(unicode, PageElement):
     PREFIX = ''
     SUFFIX = ''
 
+    # We can't tell just by looking at a string whether it's contained
+    # in an XML document or an HTML document.
+
+    known_xml = None
+
     def __new__(cls, value):
         """Create a new NavigableString.
 
@@ -743,10 +760,16 @@ class CData(PreformattedString):
     SUFFIX = u']]>'
 
 class ProcessingInstruction(PreformattedString):
+    """A SGML processing instruction."""
 
     PREFIX = u'<?'
     SUFFIX = u'>'
 
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
 class Comment(PreformattedString):
 
     PREFIX = u'<!--'
@@ -781,7 +804,8 @@ class Tag(PageElement):
     """Represents a found HTML tag with its attributes and contents."""
 
     def __init__(self, parser=None, builder=None, name=None, namespace=None,
-                 prefix=None, attrs=None, parent=None, previous=None):
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None):
         "Basic constructor."
 
         if parser is None:
@@ -795,6 +819,14 @@ class Tag(PageElement):
         self.name = name
         self.namespace = namespace
         self.prefix = prefix
+        if builder is not None:
+            preserve_whitespace_tags = builder.preserve_whitespace_tags
+        else:
+            if is_xml:
+                preserve_whitespace_tags = []
+            else:
+                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
+        self.preserve_whitespace_tags = preserve_whitespace_tags
         if attrs is None:
             attrs = {}
         elif attrs:
@@ -805,6 +837,13 @@ class Tag(PageElement):
                 attrs = dict(attrs)
         else:
             attrs = dict(attrs)
+
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+        if builder:
+            self.known_xml = builder.is_xml
+        else:
+            self.known_xml = is_xml
         self.attrs = attrs
         self.contents = []
         self.setup(parent, previous)
@@ -824,7 +863,7 @@ class Tag(PageElement):
         Its contents are a copy of the old Tag's contents.
         """
         clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.nsprefix, self.attrs)
+                           self.nsprefix, self.attrs, is_xml=self._is_xml)
         for attr in ('can_be_empty_element', 'hidden'):
             setattr(clone, attr, getattr(self, attr))
         for child in self.contents:
@@ -997,7 +1036,7 @@ class Tag(PageElement):
                     tag_name, tag_name))
             return self.find(tag_name)
         # We special case contents to avoid recursion.
-        elif not tag.startswith("__") and not tag=="contents":
+        elif not tag.startswith("__") and not tag == "contents":
             return self.find(tag)
         raise AttributeError(
             "'%s' object has no attribute '%s'" % (self.__class__, tag))
@@ -1057,10 +1096,11 @@ class Tag(PageElement):
 
     def _should_pretty_print(self, indent_level):
         """Should this tag be pretty-printed?"""
+
         return (
-            indent_level is not None and
-            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
-             or self._is_xml))
+            indent_level is not None
+            and self.name not in self.preserve_whitespace_tags
+        )
 
     def decode(self, indent_level=None,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
@@ -1280,6 +1320,7 @@ class Tag(PageElement):
 
     _selector_combinators = ['>', '+', '~']
     _select_debug = False
+    quoted_colon = re.compile('"[^"]*:[^"]*"')
     def select_one(self, selector):
         """Perform a CSS selection operation on the current element."""
         value = self.select(selector, limit=1)
@@ -1305,8 +1346,7 @@ class Tag(PageElement):
                 if limit and len(context) >= limit:
                     break
             return context
-
-        tokens = selector.split()
+        tokens = shlex.split(selector)
         current_context = [self]
 
         if tokens[-1] in self._selector_combinators:
@@ -1358,7 +1398,7 @@ class Tag(PageElement):
                     return classes.issubset(candidate.get('class', []))
                 checker = classes_match
 
-            elif ':' in token:
+            elif ':' in token and not self.quoted_colon.search(token):
                 # Pseudo-class
                 tag_name, pseudo = token.split(':', 1)
                 if tag_name == '':
@@ -1389,11 +1429,8 @@ class Tag(PageElement):
                             self.count += 1
                             if self.count == self.destination:
                                 return True
-                            if self.count > self.destination:
-                                # Stop the generator that's sending us
-                                # these things.
-                                raise StopIteration()
-                            return False
+                            else:
+                                return False
                     checker = Counter(pseudo_value).nth_child_of_type
                 else:
                     raise NotImplementedError(
@@ -1498,13 +1535,12 @@ class Tag(PageElement):
                             # don't include it in the context more than once.
                             new_context.append(candidate)
                             new_context_ids.add(id(candidate))
-                            if limit and len(new_context) >= limit:
-                                break
                     elif self._select_debug:
                         print "     FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
 
-
             current_context = new_context
+        if limit and len(current_context) >= limit:
+            current_context = current_context[:limit]
 
         if self._select_debug:
             print "Final verdict:"
@@ -1668,21 +1704,15 @@ class SoupStrainer(object):
         if isinstance(markup, list) or isinstance(markup, tuple):
             # This should only happen when searching a multi-valued attribute
             # like 'class'.
-            if (isinstance(match_against, unicode)
-                and ' ' in match_against):
-                # A bit of a special case. If they try to match "foo
-                # bar" on a multivalue attribute's value, only accept
-                # the literal value "foo bar"
-                #
-                # XXX This is going to be pretty slow because we keep
-                # splitting match_against. But it shouldn't come up
-                # too often.
-                return (whitespace_re.split(match_against) == markup)
-            else:
-                for item in markup:
-                    if self._matches(item, match_against):
-                        return True
-                return False
+            for item in markup:
+                if self._matches(item, match_against):
+                    return True
+            # We didn't match any particular value of the multivalue
+            # attribute, but maybe we match the attribute value when
+            # considered as a string.
+            if self._matches(' '.join(markup), match_against):
+                return True
+            return False
 
         if match_against is True:
             # True matches any non-None value.