Beautiful Soup will now work with versions of html5lib greater than

0.99999999. [bug=1603299]
author: Leonard Richardson <leonardr@segfault.org> 2016-07-16 22:28:40 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2016-07-16 22:28:40 -0400
commit: f18cbadf256b24912837c8b0d7fd6a2dc1a1d640 (patch)
tree: 9f8459bab4cdcf79ae76abea31025b22ce1cd111
parent: 120f4fcedc825b6c207263858e5bbded60a7886e (diff)
4 files changed, 43 insertions, 9 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 76dedd6..eda6251 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,5 +1,8 @@
 = 4.5.0 () =
 
+* Beautiful Soup will now work with versions of html5lib greater than
+  0.99999999. [bug=1603299]
+
 * Corrected handling of XML processing instructions. [bug=1504393]
 
 * The contents of <textarea> tags will no longer be modified when the
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 4df3280..80b6d93 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -142,6 +142,10 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")
 
+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
         if len(kwargs) > 0:
             arg = kwargs.keys().pop()
             raise TypeError(
@@ -184,7 +188,10 @@ class BeautifulSoup(Tag):
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
-        elif len(markup) <= 256 and not '<' in markup:
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 03a8355..c46f882 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -25,6 +25,15 @@ from bs4.element import (
     Tag,
     )
 
+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
 class HTML5TreeBuilder(HTMLTreeBuilder):
     """Use html5lib to build a tree."""
 
@@ -49,7 +58,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if self.soup.parse_only is not None:
             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
 
         # Set the character encoding detected by the tokenizer.
         if isinstance(markup, unicode):
@@ -57,7 +73,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
 
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
@@ -69,7 +91,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         return u'<html><head></head><body>%s</body></html>' % fragment
 
 
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
 
     def __init__(self, soup, namespaceHTMLElements):
         self.soup = soup
@@ -107,7 +129,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
         return self.soup
 
     def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+        return treebuilder_base.TreeBuilder.getFragment(self).element
 
 class AttrList(object):
     def __init__(self, element):
@@ -139,9 +161,9 @@ class AttrList(object):
         return name in list(self.attrs.keys())
 
 
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
     def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
+        treebuilder_base.Node.__init__(self, element.name)
         self.element = element
         self.soup = soup
         self.namespace = namespace
@@ -326,7 +348,7 @@ class Element(html5lib.treebuilders._base.Node):
 
 class TextNode(Element):
     def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
+        treebuilder_base.Node.__init__(self, None)
         self.element = element
         self.soup = soup
 
diff --git a/bs4/testing.py b/bs4/testing.py
index 676d7b3..387f775 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -495,7 +495,9 @@ Hello, world!
         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
         soup = self.soup(
             hebrew_document, from_encoding="iso8859-8")
-        self.assertEqual(soup.original_encoding, 'iso8859-8')
+        # Some tree builders call it iso8859-8, others call it iso-8859-9.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
         self.assertEqual(
             soup.encode('utf-8'),
             hebrew_document.decode("iso8859-8").encode("utf-8"))
author	Leonard Richardson <leonardr@segfault.org>	2016-07-16 22:28:40 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2016-07-16 22:28:40 -0400
commit	f18cbadf256b24912837c8b0d7fd6a2dc1a1d640 (patch)
tree	9f8459bab4cdcf79ae76abea31025b22ce1cd111
parent	120f4fcedc825b6c207263858e5bbded60a7886e (diff)