summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2012-01-20 16:18:45 -0500
committer Leonard Richardson <leonard.richardson@canonical.com> 2012-01-20 16:18:45 -0500
commit 703ee4a184e491be056ae5c4c7549e004be12622 (patch)
tree 4dd26ef0757cae50fa9bfeb4a3a216a9319785a6
parent df26dc64d868875d7cd8ca550f1a174d68dd7c67 (diff)
Made it easier to convert BS3 code to BS4.
-rw-r--r-- bs4/__init__.py 74
-rw-r--r-- bs4/builder/_htmlparser.py 8
-rw-r--r-- bs4/tests/test_htmlparser.py 4
-rw-r--r-- bs4/tests/test_soup.py 24
4 files changed, 105 insertions, 5 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 5bd3b83..07795b9 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -25,11 +25,23 @@ __license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
+import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
-from .element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
-
+from .element import (
+ CData,
+ Comment,
+ DEFAULT_OUTPUT_ENCODING,
+ Declaration,
+ Doctype,
+ NavigableString,
+ PageElement,
+ ProcessingInstruction,
+ ResultSet,
+ SoupStrainer,
+ Tag,
+ )
class BeautifulSoup(Tag):
"""
@@ -66,11 +78,67 @@ class BeautifulSoup(Tag):
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
- parse_only=None, from_encoding=None):
+ parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
+ if 'convertEntities' in kwargs:
+ warnings.warn(
+ "BS4 does not respect the convertEntities argument to the "
+ "BeautifulSoup constructor. Entities are always converted "
+ "to Unicode characters.")
+
+ if 'markupMassage' in kwargs:
+ del kwargs['markupMassage']
+ warnings.warn(
+ "BS4 does not respect the markupMassage argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for any necessary markup massage.")
+
+ if 'smartQuotesTo' in kwargs:
+ del kwargs['smartQuotesTo']
+ warnings.warn(
+ "BS4 does not respect the smartQuotesTo argument to the "
+ "BeautifulSoup constructor. Smart quotes are always converted "
+ "to Unicode characters.")
+
+ if 'selfClosingTags' in kwargs:
+ del kwargs['selfClosingTags']
+ warnings.warn(
+ "BS4 does not respect the selfClosingTags argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for understanding self-closing tags.")
+
+ if 'isHTML' in kwargs:
+ del kwargs['isHTML']
+ warnings.warn(
+ "BS4 does not respect the isHTML argument to the "
+ "BeautifulSoup constructor. You can pass in features='html' "
+ "or features='xml' to get a builder capable of handling "
+ "one or the other.")
+
+ def deprecated_argument(old_name, new_name):
+ if old_name in kwargs:
+ warnings.warn(
+ 'The "%s" argument to the BeautifulSoup constructor '
+ 'has been renamed to "%s."' % (old_name, new_name))
+ value = kwargs[old_name]
+ del kwargs[old_name]
+ return value
+ return None
+
+ parse_only = parse_only or deprecated_argument(
+ "parseOnlyThese", "parse_only")
+
+ from_encoding = from_encoding or deprecated_argument(
+ "fromEncoding", "from_encoding")
+
+ if len(kwargs) > 0:
+ arg = kwargs.keys().pop()
+ raise TypeError(
+ "__init__() got an unexpected keyword argument '%s'" % arg)
+
if builder is None:
if isinstance(features, basestring):
features = [features]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index f9476cd..53374f0 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -65,7 +65,13 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
self.soup.handle_data(data)
def handle_charref(self, name):
- self.handle_data(unichr(int(name)))
+ # XXX workaround for a bug in HTMLParser. Remove this once
+ # it's fixed.
+ if name.startswith('x'):
+ data = unichr(int(name.lstrip('x'), 16))
+ else:
+ data = unichr(int(name))
+ self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index d2db38e..8aa2471 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -44,6 +44,10 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder):
self.assertSoupEquals(
"<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>")
+ def test_hex_entities_in_text(self):
+ # XXX This tests a workaround for a bug in HTMLParser.
+ self.assertSoupEquals("&#xf1;", u"\xf1")
+
def test_entities_in_attribute_values_converted_during_parsing(self):
# The numeric entity isn't recognized without the closing
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 404a468..b588561 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -5,7 +5,29 @@ import unittest
from bs4.element import SoupStrainer
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import SoupTest
-
+import warnings
+
+class TestDeprecatedConstructorArguments(SoupTest):
+
+ def test_parseOnlyThese_renamed_to_parse_only(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
+ msg = str(w[0].message)
+ self.assertTrue("parseOnlyThese" in msg)
+ self.assertTrue("parse_only" in msg)
+ self.assertEquals("<b></b>", soup.encode())
+
+ def test_fromEncoding_renamed_to_from_encoding(self):
+ with warnings.catch_warnings(record=True) as w:
+ soup = self.soup("<a>", fromEncoding=("shift_jis"))
+ msg = str(w[0].message)
+ self.assertTrue("fromEncoding" in msg)
+ self.assertTrue("from_encoding" in msg)
+ self.assertEquals("shift_jis", soup.original_encoding)
+
+ def test_unrecognized_keyword_argument(self):
+ self.assertRaises(
+ TypeError, self.soup, "<a>", no_such_argument=True)
class TestSelectiveParsing(SoupTest):