1 files changed, 82 insertions, 21 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index d35f765..aa818ae 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/
 
 Beautiful Soup uses a pluggable XML or HTML parser to parse a
 (possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
 
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
 and/or html5lib is installed.
 
 For more than you ever wanted to know about Beautiful Soup, see the
 documentation:
 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
 """
 
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.4.0"
-__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
+__version__ = "4.5.1"
+__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
 __license__ = "MIT"
 
 __all__ = ['BeautifulSoup']
 
 import os
 import re
+import traceback
 import warnings
 
 from .builder import builder_registry, ParserRejectedMarkup
@@ -77,7 +82,7 @@ class BeautifulSoup(Tag):
 
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
-    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
 
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
@@ -137,6 +142,10 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")
 
+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
         if len(kwargs) > 0:
             arg = kwargs.keys().pop()
             raise TypeError(
@@ -161,19 +170,29 @@ class BeautifulSoup(Tag):
                     markup_type = "XML"
                 else:
                     markup_type = "HTML"
+
+                caller = traceback.extract_stack()[0]
+                filename = caller[0]
+                line_number = caller[1]
                 warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    filename=filename,
+                    line_number=line_number,
                     parser=builder.NAME,
                     markup_type=markup_type))
 
         self.builder = builder
         self.is_xml = builder.is_xml
+        self.known_xml = self.is_xml
         self.builder.soup = self
 
         self.parse_only = parse_only
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        elif len(markup) <= 256:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
@@ -195,16 +214,10 @@ class BeautifulSoup(Tag):
                 if isinstance(markup, unicode):
                     markup = markup.encode("utf8")
                 warnings.warn(
-                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
-            if markup[:5] == "http:" or markup[:6] == "https:":
-                # TODO: This is ugly but I couldn't get it to work in
-                # Python 3 otherwise.
-                if ((isinstance(markup, bytes) and not b' ' in markup)
-                    or (isinstance(markup, unicode) and not u' ' in markup)):
-                    if isinstance(markup, unicode):
-                        markup = markup.encode("utf8")
-                    warnings.warn(
-                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+                    '"%s" looks like a filename, not markup. You should'
+                    'probably open this file and pass the filehandle into'
+                    'Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)
 
         for (self.markup, self.original_encoding, self.declared_html_encoding,
          self.contains_replacement_characters) in (
@@ -223,15 +236,52 @@ class BeautifulSoup(Tag):
         self.builder.soup = None
 
     def __copy__(self):
-        return type(self)(self.encode(), builder=self.builder)
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
 
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
         if 'builder' in d and not self.builder.picklable:
-            del d['builder']
+            d['builder'] = None
         return d
 
+    @staticmethod
+    def _check_markup_is_url(markup):
+        """ 
+        Check if markup looks like it's actually a url and raise a warning 
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
     def _feed(self):
         # Convert the document to Unicode.
         self.builder.reset()
@@ -335,7 +385,18 @@ class BeautifulSoup(Tag):
         if parent.next_sibling:
             # This node is being inserted into an element that has
             # already been parsed. Deal with any dangling references.
-            index = parent.contents.index(o)
+            index = len(parent.contents)-1
+            while index >= 0:
+                if parent.contents[index] is o:
+                    break
+                index -= 1
+            else:
+                raise ValueError(
+                    "Error building tree: supposedly %r was inserted "
+                    "into %r after the fact, but I don't see it!" % (
+                        o, parent
+                    )
+                )
             if index == 0:
                 previous_element = parent
                 previous_sibling = None
@@ -387,7 +448,7 @@ class BeautifulSoup(Tag):
         """Push a start tag on to the stack.
 
         If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
+        SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """