diff options
author | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 22:28:40 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2016-07-16 22:28:40 -0400 |
commit | f18cbadf256b24912837c8b0d7fd6a2dc1a1d640 (patch) | |
tree | 9f8459bab4cdcf79ae76abea31025b22ce1cd111 | |
parent | 120f4fcedc825b6c207263858e5bbded60a7886e (diff) |
Beautiful Soup will now work with versions of html5lib greater than
0.99999999. [bug=1603299]
-rw-r--r-- | NEWS.txt | 3 | ||||
-rw-r--r-- | bs4/__init__.py | 9 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 36 | ||||
-rw-r--r-- | bs4/testing.py | 4 |
4 files changed, 43 insertions, 9 deletions
@@ -1,5 +1,8 @@ = 4.5.0 () = +* Beautiful Soup will now work with versions of html5lib greater than + 0.99999999. [bug=1603299] + * Corrected handling of XML processing instructions. [bug=1504393] * The contents of <textarea> tags will no longer be modified when the diff --git a/bs4/__init__.py b/bs4/__init__.py index 4df3280..80b6d93 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -142,6 +142,10 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( @@ -184,7 +188,10 @@ class BeautifulSoup(Tag): if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - elif len(markup) <= 256 and not '<' in markup: + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. 
# Beautiful Soup will still parse the input as markup, diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 03a8355..c46f882 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -25,6 +25,15 @@ from bs4.element import ( Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" @@ -49,7 +58,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): @@ -57,7 +73,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( @@ -69,7 +91,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): return u'<html><head></head><body>%s</body></html>' % fragment -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup @@ -107,7 +129,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return self.soup def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element class AttrList(object): def __init__(self, element): @@ -139,9 +161,9 @@ class AttrList(object): return name in list(self.attrs.keys()) -class Element(html5lib.treebuilders._base.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ -326,7 +348,7 @@ class Element(html5lib.treebuilders._base.Node): class TextNode(Element): def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup diff --git a/bs4/testing.py b/bs4/testing.py index 676d7b3..387f775 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -495,7 +495,9 @@ Hello, world! 
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' soup = self.soup( hebrew_document, from_encoding="iso8859-8") - self.assertEqual(soup.original_encoding, 'iso8859-8') + # Some tree builders call it iso8859-8, others call it iso-8859-8. + # That's not a difference we really care about. + assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') self.assertEqual( soup.encode('utf-8'), hebrew_document.decode("iso8859-8").encode("utf-8")) |