summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- TODO 7
-rw-r--r-- beautifulsoup/__init__.py 2
-rw-r--r-- beautifulsoup/dammit.py 26
-rw-r--r-- beautifulsoup/element.py 7
-rw-r--r-- tests/test_soup.py 2
5 files changed, 23 insertions, 21 deletions
diff --git a/TODO b/TODO
index a799bbb..a6f444f 100644
--- a/TODO
+++ b/TODO
@@ -1,11 +1,6 @@
Bare ampersands should be converted to HTML entities upon output.
-It should also be possible to, on output, convert to HTML entities any
-Unicode characters found in htmlentitydefs.codepoint2name. (This
-algorithm would allow me to simplify Unicode, Dammit--convert
-everything to Unicode, and then convert to entities upon output, not
-treating smart quotes differently from any other Unicode character
-that can be represented as an entity.)
+Add namespace support.
XML handling:
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 53130e0..518e95f 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -112,7 +112,7 @@ class BeautifulSoup(Tag):
if builder is None:
if isinstance(features, basestring):
features = [features]
- if len(features) == 0:
+ if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
index 31dfa95..4483118 100644
--- a/beautifulsoup/dammit.py
+++ b/beautifulsoup/dammit.py
@@ -31,6 +31,8 @@ except ImportError:
class EntitySubstitution(object):
+ """Substitute XML or HTML entities for the corresponding characters."""
+
def _populate_class_variables():
lookup = {}
characters = []
@@ -61,17 +63,20 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
- def _substitute_html_entity(self, matchobj):
- entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
+ @classmethod
+ def _substitute_html_entity(cls, matchobj):
+ entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
- def _substitute_xml_entity(self, matchobj):
+ @classmethod
+ def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
- entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
+ entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
- def substitute_xml(self, value, make_quoted_attribute=False):
+ @classmethod
+ def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
@@ -117,14 +122,15 @@ class EntitySubstitution(object):
# Escape angle brackets, and ampersands that aren't part of
# entities.
- value = self.BARE_AMPERSAND_OR_BRACKET.sub(
- self._substitute_xml_entity, value)
+ value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
if make_quoted_attribute:
return quote_with + value + quote_with
else:
return value
- def substitute_html(self, s):
+ @classmethod
+ def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
@@ -135,8 +141,8 @@ class EntitySubstitution(object):
character with "&eacute;" will make it more readable to some
people.
"""
- return self.CHARACTER_TO_HTML_ENTITY_RE.sub(
- self._substitute_html_entity, s)
+ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
+ cls._substitute_html_entity, s)
class UnicodeDammit:
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index 6af27a8..61ed4ab 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -11,7 +11,7 @@ from util import isList
DEFAULT_OUTPUT_ENCODING = "utf-8"
-class PageElement(EntitySubstitution):
+class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -363,7 +363,7 @@ class NavigableString(unicode, PageElement):
def output_ready(self, substitute_html_entities=False):
if substitute_html_entities:
- output = self.substitute_html(self)
+ output = EntitySubstitution.substitute_html(self)
else:
output = self
return self.PREFIX + output + self.SUFFIX
@@ -580,7 +580,8 @@ class Tag(PageElement):
and '%SOUP-ENCODING%' in val):
val = self.substituteEncoding(val, eventual_encoding)
- decoded = key + '=' + self.substitute_xml(val, True)
+ decoded = (key + '='
+ + EntitySubstitution.substitute_xml(val, True))
attrs.append(decoded)
close = ''
closeTag = ''
diff --git a/tests/test_soup.py b/tests/test_soup.py
index c4d9c2c..690db39 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -19,7 +19,7 @@ class TestSelectiveParsing(SoupTest):
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
- self.sub = EntitySubstitution()
+ self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites