diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2012-01-20 16:18:45 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2012-01-20 16:18:45 -0500 |
commit | 703ee4a184e491be056ae5c4c7549e004be12622 (patch) | |
tree | 4dd26ef0757cae50fa9bfeb4a3a216a9319785a6 | |
parent | df26dc64d868875d7cd8ca550f1a174d68dd7c67 (diff) |
Made it easier to convert BS3 code to BS4.
-rw-r--r-- | bs4/__init__.py | 74 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 8 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 4 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 24 |
4 files changed, 105 insertions, 5 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 5bd3b83..07795b9 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -25,11 +25,23 @@ __license__ = "MIT" __all__ = ['BeautifulSoup'] import re +import warnings from .builder import builder_registry from .dammit import UnicodeDammit -from .element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag - +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) class BeautifulSoup(Tag): """ @@ -66,11 +78,67 @@ class BeautifulSoup(Tag): STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None): + parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. 
The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. You can pass in features='html' " + "or features='xml' to get a builder capable of handling " + "one or the other.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if len(kwargs) > 0: + arg = kwargs.keys().pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + if builder is None: if isinstance(features, basestring): features = [features] diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index f9476cd..53374f0 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -65,7 +65,13 @@ class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder): self.soup.handle_data(data) def handle_charref(self, name): - self.handle_data(unichr(int(name))) + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed. 
+ if name.startswith('x'): + data = unichr(int(name.lstrip('x'), 16)) + else: + data = unichr(int(name)) + self.handle_data(data) def handle_entityref(self, name): character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index d2db38e..8aa2471 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -44,6 +44,10 @@ class TestHTMLParserTreeBuilder(TestLXMLBuilder): self.assertSoupEquals( "<p>Foo<br/>bar</p>", "<p>Foo<br />bar</p>") + def test_hex_entities_in_text(self): + # XXX This tests a workaround for a bug in HTMLParser. + self.assertSoupEquals("&#xf1;", u"\xf1") + def test_entities_in_attribute_values_converted_during_parsing(self): # The numeric entity isn't recognized without the closing diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 404a468..b588561 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -5,7 +5,29 @@ import unittest from bs4.element import SoupStrainer from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.testing import SoupTest - +import warnings + +class TestDeprecatedConstructorArguments(SoupTest): + + def test_parseOnlyThese_renamed_to_parse_only(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b")) + msg = str(w[0].message) + self.assertTrue("parseOnlyThese" in msg) + self.assertTrue("parse_only" in msg) + self.assertEquals("<b></b>", soup.encode()) + + def test_fromEncoding_renamed_to_from_encoding(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("<a>", fromEncoding=("shift_jis")) + msg = str(w[0].message) + self.assertTrue("fromEncoding" in msg) + self.assertTrue("from_encoding" in msg) + self.assertEquals("shift_jis", soup.original_encoding) + + def test_unrecognized_keyword_argument(self): + self.assertRaises( + TypeError, self.soup, "<a>", no_such_argument=True) class 
TestSelectiveParsing(SoupTest): |