summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/beautifulsoup/__init__.py 31
-rw-r--r-- src/beautifulsoup/builder/__init__.py 4
-rw-r--r-- src/beautifulsoup/testing.py (renamed from src/beautifulsoup/tests/helpers.py) 0
-rw-r--r-- src/beautifulsoup/tests/test_html5lib.py 12
-rw-r--r-- src/beautifulsoup/tests/test_lxml.py 14
-rw-r--r-- src/beautifulsoup/tests/test_soup.py 27
-rw-r--r-- src/beautifulsoup/tests/test_strainer.py 2
-rw-r--r-- src/beautifulsoup/tests/test_tree.py 2
8 files changed, 69 insertions, 23 deletions
diff --git a/src/beautifulsoup/__init__.py b/src/beautifulsoup/__init__.py
index 79bb657..8817164 100644
--- a/src/beautifulsoup/__init__.py
+++ b/src/beautifulsoup/__init__.py
@@ -299,20 +299,26 @@ class BeautifulStoneSoup(Tag):
def handleSpecialMetaTag(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
try to convert the document to that charset, and re-parse the
- document from the beginning."""
+ document from the beginning. Neither lxml nor html5lib does
+ this, so the feature is still here."""
httpEquiv = None
contentType = None
contentTypeIndex = None
tagNeedsEncodingSubstitution = False
- for i in range(0, len(attrs)):
- key, value = attrs[i]
- key = key.lower()
- if key == 'http-equiv':
- httpEquiv = value
- elif key == 'content':
- contentType = value
- contentTypeIndex = i
+ if isinstance(attrs, dict):
+ httpEquiv = attrs.get('http-equiv')
+ contentType = attrs.get('content')
+ else:
+ # XXX Do we still need to support attrs as a list of (key, value) pairs?
+ for i in range(0, len(attrs)):
+ key, value = attrs[i]
+ key = key.lower()
+ if key == 'http-equiv':
+ httpEquiv = value
+ elif key == 'content':
+ contentType = value
+ contentTypeIndex = i
if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
@@ -327,8 +333,11 @@ class BeautifulStoneSoup(Tag):
def rewrite(match):
return match.group(1) + "%SOUP-ENCODING%"
newAttr = self.CHARSET_RE.sub(rewrite, contentType)
- attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
- newAttr)
+ if isinstance(attrs, dict):
+ attrs['content'] = newAttr
+ else:
+ attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
+ newAttr)
tagNeedsEncodingSubstitution = True
else:
# This is our first pass through the document.
diff --git a/src/beautifulsoup/builder/__init__.py b/src/beautifulsoup/builder/__init__.py
index 8294c0c..2d33a0b 100644
--- a/src/beautifulsoup/builder/__init__.py
+++ b/src/beautifulsoup/builder/__init__.py
@@ -11,8 +11,8 @@ class TreeBuilder(Entities):
assume_html = False
smart_quotes_to = Entities.XML_ENTITIES
- convert_html_entities = True
- convert_xml_entities = True
+ convert_html_entities = False
+ convert_xml_entities = False
def __init__(self):
self.soup = None
diff --git a/src/beautifulsoup/tests/helpers.py b/src/beautifulsoup/testing.py
index 20d087e..20d087e 100644
--- a/src/beautifulsoup/tests/helpers.py
+++ b/src/beautifulsoup/testing.py
diff --git a/src/beautifulsoup/tests/test_html5lib.py b/src/beautifulsoup/tests/test_html5lib.py
index 4ffd968..7164dac 100644
--- a/src/beautifulsoup/tests/test_html5lib.py
+++ b/src/beautifulsoup/tests/test_html5lib.py
@@ -1,5 +1,8 @@
-from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest
from beautifulsoup.builder.html5lib_builder import HTML5TreeBuilder
+from beautifulsoup.testing import (
+ BuilderInvalidMarkupSmokeTest,
+ BuilderSmokeTest,
+)
class TestHTML5Builder(BuilderSmokeTest):
@@ -30,4 +33,11 @@ class TestHTML5BuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest):
'<table><tbody><tr id="nested"></tr></tbody></table>'))
+ def test_foo(self):
+ isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+ soup = self.soup(isolatin)
+ utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
+ utf8 = utf8.replace("\xe9", "\xc3\xa9")
+
+ print soup
diff --git a/src/beautifulsoup/tests/test_lxml.py b/src/beautifulsoup/tests/test_lxml.py
index cd22b6f..b53ee42 100644
--- a/src/beautifulsoup/tests/test_lxml.py
+++ b/src/beautifulsoup/tests/test_lxml.py
@@ -1,6 +1,9 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
-from helpers import BuilderInvalidMarkupSmokeTest, BuilderSmokeTest
+from beautifulsoup.testing import (
+ BuilderInvalidMarkupSmokeTest,
+ BuilderSmokeTest,
+)
class TestLXMLBuilder(BuilderSmokeTest):
"""See `BuilderSmokeTest`."""
@@ -10,6 +13,15 @@ class TestLXMLBuilder(BuilderSmokeTest):
self.assertSoupEquals(
"A bare string", "<p>A bare string</p>")
+ def test_foo(self):
+ isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
+ soup = self.soup(isolatin)
+
+ utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
+ utf8 = utf8.replace("\xe9", "\xc3\xa9")
+
+ print soup
+
class TestLXMLBuilderInvalidMarkup(BuilderInvalidMarkupSmokeTest):
"""See `BuilderInvalidMarkupSmokeTest`."""
diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py
index d95cba6..ec0394d 100644
--- a/src/beautifulsoup/tests/test_soup.py
+++ b/src/beautifulsoup/tests/test_soup.py
@@ -2,9 +2,9 @@
"""Tests of Beautiful Soup as a whole."""
import unittest
-from helpers import SoupTest
from beautifulsoup.element import SoupStrainer
from beautifulsoup.dammit import UnicodeDammit
+from beautifulsoup.testing import SoupTest
class TestEncodingConversion(SoupTest):
@@ -48,6 +48,15 @@ class TestEncodingConversion(SoupTest):
soup_from_unicode = self.soup(self.unicode_data)
self.assertEquals(soup_from_unicode.encode('utf-8'), self.utf8_data)
+ def test_hebrew(self):
+ # A real-world test to make sure we can convert ISO-8859-8 (a
+ # Hebrew encoding) to UTF-8.
+ iso_8859_8= '<HTML><HEAD><TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE></HEAD><BODY><H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\xed\xe5\xec\xf9</BODY></HTML>'
+ utf8 = '<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9</body></html>'
+ soup = self.soup(iso_8859_8, fromEncoding="iso-8859-8")
+ self.assertEquals(soup.originalEncoding, 'iso-8859-8')
+ self.assertEquals(soup.encode('utf-8'), utf8)
+
class TestSelectiveParsing(SoupTest):
@@ -58,14 +67,20 @@ class TestSelectiveParsing(SoupTest):
self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>")
-
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
- def test_smart_quote_replacement(self):
- markup = "<foo>\x92</foo>"
+ def test_smart_quotes_to_xml_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
- self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
+ self.assertEquals(
+ dammit.unicode, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
+
+ def test_smart_quotes_to_html_entities(self):
+ markup = "<foo>\x91\x92\x93\x94</foo>"
+ dammit = UnicodeDammit(markup, smartQuotesTo="html")
+ self.assertEquals(
+ dammit.unicode, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_detect_utf8(self):
utf8 = "\xc3\xa9"
@@ -87,7 +102,7 @@ class TestUnicodeDammit(unittest.TestCase):
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
- dammit = UnicodeDammit(utf8_data, ["iso-8859-1"])
+ dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEquals(dammit.originalEncoding, 'utf-8')
def test_ignore_invalid_codecs(self):
diff --git a/src/beautifulsoup/tests/test_strainer.py b/src/beautifulsoup/tests/test_strainer.py
index 9a91463..f078935 100644
--- a/src/beautifulsoup/tests/test_strainer.py
+++ b/src/beautifulsoup/tests/test_strainer.py
@@ -1,7 +1,7 @@
import unittest
-from helpers import SoupTest
from beautifulsoup import BeautifulSoup
from beautifulsoup.element import SoupStrainer
+from beautifulsoup.testing import SoupTest
class TestSoupStrainer(unittest.TestCase):
diff --git a/src/beautifulsoup/tests/test_tree.py b/src/beautifulsoup/tests/test_tree.py
index 42430d3..344a462 100644
--- a/src/beautifulsoup/tests/test_tree.py
+++ b/src/beautifulsoup/tests/test_tree.py
@@ -12,7 +12,7 @@ methods tested here.
import re
from beautifulsoup import BeautifulSoup
from beautifulsoup.element import SoupStrainer, Tag
-from helpers import SoupTest
+from beautifulsoup.testing import SoupTest
class TreeTest(SoupTest):