diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-26 23:47:06 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-26 23:47:06 -0500 |
commit | dc5682014c7360e723d4861d32ee933eea8fcd5d (patch) | |
tree | 021886f2b0e2158d87d33417b8b687dfb1cee923 | |
parent | ace32031ac6c9787ee46c5ab19e6f71b99cd26d3 (diff) | |
parent | d9f49a66e4a7dfd93823f2396796ed6c55f69648 (diff) |
Minor late-night tweaks.
-rw-r--r-- | TODO | 7 | ||||
-rw-r--r-- | beautifulsoup/__init__.py | 2 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 26 | ||||
-rw-r--r-- | beautifulsoup/element.py | 7 | ||||
-rw-r--r-- | tests/test_soup.py | 2 |
5 files changed, 23 insertions, 21 deletions
@@ -1,11 +1,6 @@ Bare ampersands should be converted to HTML entities upon output. -It should also be possible to, on output, convert to HTML entities any -Unicode characters found in htmlentitydefs.codepoint2name. (This -algorithm would allow me to simplify Unicode, Dammit--convert -everything to Unicode, and then convert to entities upon output, not -treating smart quotes differently from any other Unicode character -that can be represented as an entity.) +Add namespace support. XML handling: diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 53130e0..518e95f 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -112,7 +112,7 @@ class BeautifulSoup(Tag): if builder is None: if isinstance(features, basestring): features = [features] - if len(features) == 0: + if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 31dfa95..4483118 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -31,6 +31,8 @@ except ImportError: class EntitySubstitution(object): + """Substitute XML or HTML entities for the corresponding characters.""" + def _populate_class_variables(): lookup = {} characters = [] @@ -61,17 +63,20 @@ class EntitySubstitution(object): "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") - def _substitute_html_entity(self, matchobj): - entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) return "&%s;" % entity - def _substitute_xml_entity(self, matchobj): + @classmethod + def _substitute_xml_entity(cls, matchobj): """Used with a regular expression to substitute the appropriate XML entity for an XML special character.""" - entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + entity = 
cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] return "&%s;" % entity - def substitute_xml(self, value, make_quoted_attribute=False): + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. :param value: A string to be substituted. The less-than sign will @@ -117,14 +122,15 @@ class EntitySubstitution(object): # Escape angle brackets, and ampersands that aren't part of # entities. - value = self.BARE_AMPERSAND_OR_BRACKET.sub( - self._substitute_xml_entity, value) + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) if make_quoted_attribute: return quote_with + value + quote_with else: return value - def substitute_html(self, s): + @classmethod + def substitute_html(cls, s): """Replace certain Unicode characters with named HTML entities. This differs from data.encode(encoding, 'xmlcharrefreplace') @@ -135,8 +141,8 @@ class EntitySubstitution(object): character with "&eacute;" will make it more readable to some people. 
""" - return self.CHARACTER_TO_HTML_ENTITY_RE.sub( - self._substitute_html_entity, s) + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) class UnicodeDammit: diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 6af27a8..61ed4ab 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -11,7 +11,7 @@ from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class PageElement(EntitySubstitution): +class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -363,7 +363,7 @@ class NavigableString(unicode, PageElement): def output_ready(self, substitute_html_entities=False): if substitute_html_entities: - output = self.substitute_html(self) + output = EntitySubstitution.substitute_html(self) else: output = self return self.PREFIX + output + self.SUFFIX @@ -580,7 +580,8 @@ class Tag(PageElement): and '%SOUP-ENCODING%' in val): val = self.substituteEncoding(val, eventual_encoding) - decoded = key + '=' + self.substitute_xml(val, True) + decoded = (key + '=' + + EntitySubstitution.substitute_xml(val, True)) attrs.append(decoded) close = '' closeTag = '' diff --git a/tests/test_soup.py b/tests/test_soup.py index c4d9c2c..690db39 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -19,7 +19,7 @@ class TestSelectiveParsing(SoupTest): class TestEntitySubstitution(unittest.TestCase): """Standalone tests of the EntitySubstitution class.""" def setUp(self): - self.sub = EntitySubstitution() + self.sub = EntitySubstitution def test_simple_html_substitution(self): # Unicode characters corresponding to named HTML entites |