Made it easier to convert BS3 code to BS4.

author: Leonard Richardson <leonard.richardson@canonical.com> 2012-01-20 16:18:45 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2012-01-20 16:18:45 -0500
commit: 703ee4a184e491be056ae5c4c7549e004be12622 (patch)
tree: 4dd26ef0757cae50fa9bfeb4a3a216a9319785a6
parent: df26dc64d868875d7cd8ca550f1a174d68dd7c67 (diff)
4 files changed, 105 insertions, 5 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 5bd3b83..07795b9 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -25,11 +25,23 @@ __license__ = "MIT"
 __all__ = ['BeautifulSoup']
 
 import re
+import warnings
 
 from .builder import builder_registry
 from .dammit import UnicodeDammit
-from .element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
-
+from .element import (
+    CData,
+    Comment,
+    DEFAULT_OUTPUT_ENCODING,
+    Declaration,
+    Doctype,
+    NavigableString,
+    PageElement,
+    ProcessingInstruction,
+    ResultSet,
+    SoupStrainer,
+    Tag,
+    )
 
 class BeautifulSoup(Tag):
     """
@@ -66,11 +78,67 @@ class BeautifulSoup(Tag):
     STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
 
     def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None):
+                 parse_only=None, from_encoding=None, **kwargs):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser."""
 
+        if 'convertEntities' in kwargs:
+            warnings.warn(
+                "BS4 does not respect the convertEntities argument to the "
+                "BeautifulSoup constructor. Entities are always converted "
+                "to Unicode characters.")
+
+        if 'markupMassage' in kwargs:
+            del kwargs['markupMassage']
+            warnings.warn(
+                "BS4 does not respect the markupMassage argument to the "
+                "BeautifulSoup constructor. The tree builder is responsible "
+                "for any necessary markup massage.")
+
+        if 'smartQuotesTo' in kwargs:
+            del kwargs['smartQuotesTo']
+            warnings.warn(
+                "BS4 does not respect the smartQuotesTo argument to the "
+                "BeautifulSoup constructor. Smart quotes are always converted "
+                "to Unicode characters.")
+
+        if 'selfClosingTags' in kwargs:
+            del kwargs['selfClosingTags']
+            warnings.warn(
+                "BS4 does not respect the selfClosingTags argument to the "
+                "BeautifulSoup constructor. The tree builder is responsible "
+                "for understanding self-closing tags.")
+
+        if 'isHTML' in kwargs:
+            del kwargs['isHTML']
+            warnings.warn(
+                "BS4 does not respect the isHTML argument to the "
+                "BeautifulSoup constructor. You can pass in features='html' "
+                "or features='xml' to get a builder capable of handling "
+                "one or the other.")
+
+        def deprecated_argument(old_name, new_name):
+            if old_name in kwargs:
+                warnings.warn(
+                    'The "%s" argument to the BeautifulSoup constructor '
+                    'has been renamed to "%s."' % (old_name, new_name))
+                value = kwargs[old_name]
+                del kwargs[old_name]
+                return value
+            return None
+
+        parse_only = parse_only or deprecated_argument(
+            "parseOnlyThese", "parse_only")
+
+        from_encoding = from_encoding or deprecated_argument(
+            "fromEncoding", "from_encoding")
+
+        if len(kwargs) > 0:
+            arg = kwargs.keys().pop()
+            raise TypeError(
+                "__init__() got an unexpected keyword argument '%s'" % arg)
+
         if builder is None:
             if isinstance(features, basestring):
                 features = [features]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index f9476cd..53374f0 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -65,7 +65,13 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
         self.soup.handle_data(data)
 
     def handle_charref(self, name):
-        self.handle_data(unichr(int(name)))
+        # XXX workaround for an HTMLParser bug (remove once fixed): hex
+        # references arrive as 'xNN' or 'XNN' and must be read as base 16.
+        if name.startswith(('x', 'X')):
+            data = unichr(int(name[1:], 16))
+        else:
+            data = unichr(int(name))
+        self.handle_data(data)
 
     def handle_entityref(self, name):
         character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index d2db38e..8aa2471 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -44,6 +44,10 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder):
         self.assertSoupEquals(
             "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
 
+    def test_hex_entities_in_text(self):
+        # XXX Tests the workaround for HTMLParser's hex charref bug.
+        self.assertSoupEquals("&#xf1;", u"\xf1")
+
     def test_entities_in_attribute_values_converted_during_parsing(self):
 
         # The numeric entity isn't recognized without the closing
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 404a468..b588561 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -5,7 +5,29 @@ import unittest
 from bs4.element import SoupStrainer
 from bs4.dammit import EntitySubstitution, UnicodeDammit
 from bs4.testing import SoupTest
-
+import warnings
+
+class TestDeprecatedConstructorArguments(SoupTest):
+    """BS3 constructor keywords: renamed ones warn, unknown ones raise."""
+
+    def test_parseOnlyThese_renamed_to_parse_only(self):
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")  # record even repeat warnings
+            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
+        msg = str(w[0].message)
+        self.assertTrue("parseOnlyThese" in msg and "parse_only" in msg, msg)
+        self.assertEquals("<b></b>", soup.encode())
+
+    def test_fromEncoding_renamed_to_from_encoding(self):
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")  # record even repeat warnings
+            soup = self.soup("<a>", fromEncoding="shift_jis")
+        msg = str(w[0].message)
+        self.assertTrue("fromEncoding" in msg and "from_encoding" in msg, msg)
+        self.assertEquals("shift_jis", soup.original_encoding)
+
+    def test_unrecognized_keyword_argument(self):
+        self.assertRaises(TypeError, self.soup, "<a>", no_such_argument=True)
 
 class TestSelectiveParsing(SoupTest):
author	Leonard Richardson <leonard.richardson@canonical.com>	2012-01-20 16:18:45 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2012-01-20 16:18:45 -0500
commit	703ee4a184e491be056ae5c4c7549e004be12622 (patch)
tree	4dd26ef0757cae50fa9bfeb4a3a216a9319785a6
parent	df26dc64d868875d7cd8ca550f1a174d68dd7c67 (diff)