diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/beautifulsoup/__init__.py | 31 | ||||
-rw-r--r-- | src/beautifulsoup/builder/__init__.py | 4 | ||||
-rw-r--r-- | src/beautifulsoup/testing.py (renamed from src/beautifulsoup/tests/helpers.py) | 0 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_html5lib.py | 12 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_lxml.py | 14 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_soup.py | 27 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_strainer.py | 2 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_tree.py | 2 |
8 files changed, 69 insertions, 23 deletions
diff --git a/src/beautifulsoup/__init__.py b/src/beautifulsoup/__init__.py index 79bb657..8817164 100644 --- a/src/beautifulsoup/__init__.py +++ b/src/beautifulsoup/__init__.py @@ -299,20 +299,26 @@ class BeautifulStoneSoup(Tag): def handleSpecialMetaTag(self, attrs): """Beautiful Soup can detect a charset included in a META tag, try to convert the document to that charset, and re-parse the - document from the beginning.""" + document from the beginning. Neither lxml nor html5lib does + this, so the feature is still here.""" httpEquiv = None contentType = None contentTypeIndex = None tagNeedsEncodingSubstitution = False - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i + if isinstance(attrs, dict): + httpEquiv = attrs.get('http-equiv') + contentType = attrs.get('content') + else: + # XXX do we need this? + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i if httpEquiv and contentType: # It's an interesting meta tag. match = self.CHARSET_RE.search(contentType) @@ -327,8 +333,11 @@ class BeautifulStoneSoup(Tag): def rewrite(match): return match.group(1) + "%SOUP-ENCODING%" newAttr = self.CHARSET_RE.sub(rewrite, contentType) - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) + if isinstance(attrs, dict): + attrs['content'] = newAttr + else: + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) tagNeedsEncodingSubstitution = True else: # This is our first pass through the document. 
diff --git a/src/beautifulsoup/builder/__init__.py b/src/beautifulsoup/builder/__init__.py index 8294c0c..2d33a0b 100644 --- a/src/beautifulsoup/builder/__init__.py +++ b/src/beautifulsoup/builder/__init__.py @@ -11,8 +11,8 @@ class TreeBuilder(Entities): assume_html = False smart_quotes_to = Entities.XML_ENTITIES - convert_html_entities = True - convert_xml_entities = True + convert_html_entities = False + convert_xml_entities = False def __init__(self): self.soup = None diff --git a/src/beautifulsoup/tests/helpers.py b/src/beautifulsoup/testing.py index 20d087e..20d087e 100644 --- a/src/beautifulsoup/tests/helpers.py +++ b/src/beautifulsoup/testing.py diff --git a/src/beautifulsoup/tests/test_html5lib.py b/src/beautifulsoup/tests/test_html5lib.py index 4ffd968..7164dac 100644 --- a/src/beautifulsoup/tests/test_html5lib.py +++ b/src/beautifulsoup/tests/test_html5lib.py @@ -1,5 +1,8 @@ -from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder +from beautifulsoup.testing import ( + BuilderInvalidMarkupSmokeTest, + BuilderSmokeTest, +) class TestHTML5Builder(BuilderSmokeTest): @@ -30,4 +33,11 @@ class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): '<table><tbody><tr id="nested"></tr></tbody></table>')) + def test_foo(self): + isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" + soup = self.soup(isolatin) + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + + print soup diff --git a/src/beautifulsoup/tests/test_lxml.py b/src/beautifulsoup/tests/test_lxml.py index cd22b6f..b53ee42 100644 --- a/src/beautifulsoup/tests/test_lxml.py +++ b/src/beautifulsoup/tests/test_lxml.py @@ -1,6 +1,9 @@ """Tests to ensure that the lxml tree builder generates good trees.""" -from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest +from 
beautifulsoup.testing import ( + BuilderInvalidMarkupSmokeTest, + BuilderSmokeTest, +) class TestLXMLBuilder(BuilderSmokeTest): """See `BuilderSmokeTest`.""" @@ -10,6 +13,15 @@ class TestLXMLBuilder(BuilderSmokeTest): self.assertSoupEquals( "A bare string", "<p>A bare string</p>") + def test_foo(self): + isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>""" + soup = self.soup(isolatin) + + utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode()) + utf8 = utf8.replace("\xe9", "\xc3\xa9") + + print soup + class TestLXMLBuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest): """See `BuilderInvalidMarkupSmokeTest`.""" diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py index d95cba6..ec0394d 100644 --- a/src/beautifulsoup/tests/test_soup.py +++ b/src/beautifulsoup/tests/test_soup.py @@ -2,9 +2,9 @@ """Tests of Beautiful Soup as a whole.""" import unittest -from helpers import SoupTest from beautifulsoup.element import SoupStrainer from beautifulsoup.dammit import UnicodeDammit +from beautifulsoup.testing import SoupTest class TestEncodingConversion(SoupTest): @@ -48,6 +48,15 @@ class TestEncodingConversion(SoupTest): soup_from_unicode = self.soup(self.unicode_data) self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data) + def test_hebrew(self): + # A real-world test to make sure we can convert ISO-8859-8 (a + # Hebrew encoding) to UTF-8. 
+ iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>' + utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>' + soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8") + self.assertEquals(soup.originalEncoding, 'iso-8859-8') + self.assertEquals(soup.encode('utf-8'), utf8) + class TestSelectiveParsing(SoupTest): @@ -58,14 +67,20 @@ class TestSelectiveParsing(SoupTest): self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") - class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" - def test_smart_quote_replacement(self): - markup = "<foo>\x92</foo>" + def test_smart_quotes_to_xml_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" dammit = UnicodeDammit(markup) - self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>") + self.assertEquals( + dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>") + + def test_smart_quotes_to_html_entities(self): + markup = "<foo>\x91\x92\x93\x94</foo>" + dammit = UnicodeDammit(markup, smartQuotesTo="html") + self.assertEquals( + dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>") def test_detect_utf8(self): utf8 = "\xc3\xa9" @@ -87,7 +102,7 @@ class TestUnicodeDammit(unittest.TestCase): def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") - dammit = UnicodeDammit(utf8_data, ["iso-8859-1"]) + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEquals(dammit.originalEncoding, 'utf-8') def test_ignore_invalid_codecs(self): diff --git a/src/beautifulsoup/tests/test_strainer.py b/src/beautifulsoup/tests/test_strainer.py index 9a91463..f078935 100644 --- a/src/beautifulsoup/tests/test_strainer.py +++ b/src/beautifulsoup/tests/test_strainer.py @@ -1,7 +1,7 @@ import unittest -from helpers import SoupTest from beautifulsoup import 
BeautifulSoup from beautifulsoup.element import SoupStrainer +from beautifulsoup.testing import SoupTest class TestSoupStrainer(unittest.TestCase): diff --git a/src/beautifulsoup/tests/test_tree.py b/src/beautifulsoup/tests/test_tree.py index 42430d3..344a462 100644 --- a/src/beautifulsoup/tests/test_tree.py +++ b/src/beautifulsoup/tests/test_tree.py @@ -12,7 +12,7 @@ methods tested here. import re from beautifulsoup import BeautifulSoup from beautifulsoup.element import SoupStrainer, Tag -from helpers import SoupTest +from beautifulsoup.testing import SoupTest class TreeTest(SoupTest): |