diff options
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | beautifulsoup/__init__.py | 4 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 3 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 119 | ||||
-rw-r--r-- | beautifulsoup/element.py | 155 | ||||
-rw-r--r-- | beautifulsoup/util.py | 10 | ||||
-rw-r--r-- | tests/test_soup.py | 67 |
7 files changed, 234 insertions, 125 deletions
@@ -25,6 +25,7 @@ So have some arguments to popular methods: * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...) * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...) + * Tag.encode(prettyPrint=...) -> Tag.encode(pretty_print=...) == Generators are now properties == diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index c998924..ce39d33 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -63,10 +63,10 @@ __all__ = ['BeautifulSoup'] import re -from util import isList, isString, buildSet +from util import isList, buildSet from builder import builder_registry from dammit import UnicodeDammit -from element import Entities, NavigableString, Tag +from element import NavigableString, Tag class BeautifulSoup(Tag): diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index b97c5f9..fb10628 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -1,7 +1,6 @@ from collections import defaultdict import re import sys -from beautifulsoup.element import Entities __all__ = [ 'HTMLTreeBuilder', @@ -73,7 +72,7 @@ class TreeBuilderRegistry(object): builder_registry = TreeBuilderRegistry() -class TreeBuilder(Entities): +class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" features = [] diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 455b0bf..67bec17 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -7,6 +7,7 @@ encoding; that's the tree builder's job. """ import codecs +from htmlentitydefs import codepoint2name import re import types @@ -21,18 +22,124 @@ try: except ImportError: chardet = None -# Both are available from http://cjkpython.i18n.org/ -# They're built in if you use Python 2.4. -try: - import cjkcodecs.aliases -except ImportError: - pass +# Available from http://cjkpython.i18n.org/. 
try: import iconv_codec except ImportError: pass +class EntitySubstitution(object): + + def _populate_class_variables(): + lookup = {} + characters = [] + for codepoint, name in codepoint2name.items(): + character = unichr(codepoint) + characters.append(character) + lookup[character] = name + re_definition = "[%s]" % "".join(characters) + return lookup, re.compile(re_definition) + CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = ( + _populate_class_variables()) + + + CHARACTER_TO_XML_ENTITY = { + "'" : "apos", + '"' : "quot", + "&" : "amp", + "<" : "lt", + ">" : "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + def _substitute_html_entity(self, matchobj): + entity = self.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + def _substitute_xml_entity(self, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = self.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + def substitute_xml(self, value, make_quoted_attribute=False, + destination_is_xml=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become &lt;, the greater-than sign will become &gt;, and any + ampersands that are not part of an entity defition will + become &amp;. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + + Ordinarily, the string will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If the string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If the string contains both single and double quotes, the + single quotes will be escaped (see `destination_is_xml`), and + the string will be quoted using single quotes. 
+ + Welcome to "Bob's Bar" -> 'Welcome to "Bob&squot;s bar' + OR + 'Welcome to "Bob&apos;s bar' + (depending on the value of `destination_is_xml`) + + :param destination_is_xml: If destination_is_xml is True, + then when a single quote is escaped it will become + "&apos;". But &apos; is not a valid HTML 4 entity. If + destination_is_xml is False, then single quotes will be + turned into "&squot;". + + The value of this argument is irrelevant unless + make_quoted_attribute is True. + """ + quote_with = '"' + if make_quoted_attribute: + if '"' in value: + quote_with = "'" + if "'" in value: + if destination_is_xml: + replace_with = "&apos;" + else: + replace_with = "&squot;" + value = value.replace("'", replace_with) + + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = self.BARE_AMPERSAND_OR_BRACKET.sub( + self._substitute_xml_entity, value) + if make_quoted_attribute: + return quote_with + value + quote_with + else: + return value + + def substitute_html(self, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "&eacute;" will make it more readable to some + people. + """ + return self.CHARACTER_TO_HTML_ENTITY_RE.sub( + self._substitute_html_entity, s) + + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. 
If the source encoding is diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index a70813d..0ef9db1 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -4,32 +4,12 @@ try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} +from beautifulsoup.dammit import EntitySubstitution -from util import isString, isList +from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class Entities(object): - """A mixin class that knows about XML entities.""" - - HTML_ENTITIES = "html" - XML_ENTITIES = "xml" - XHTML_ENTITIES = "xhtml" - - def _invert(h): - "Cheap function to invert a hash." - i = {} - for k,v in h.items(): - i[v] = k - return i - - XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } - - XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) class PageElement(object): """Contains the navigational information for some part of the page @@ -378,28 +358,28 @@ class NavigableString(unicode, PageElement): else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): return self class CData(NavigableString): - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): return u'<![CDATA[' + self + u']]>' class ProcessingInstruction(NavigableString): - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): output = self if u'%SOUP-ENCODING%' in output: - output = self.substituteEncoding(output, eventualEncoding) + output = self.substituteEncoding(output, eventual_encoding) return u'<?' 
+ output + u'?>' class Comment(NavigableString): - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): return u'<!--' + self + u'-->' class Declaration(NavigableString): - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): return u'<!' + self + u'>' class Doctype(NavigableString): @@ -414,10 +394,10 @@ class Doctype(NavigableString): return Doctype(value) - def decodeGivenEventualEncoding(self, eventualEncoding): + def decodeGivenEventualEncoding(self, eventual_encoding): return u'<!DOCTYPE ' + self + u'>' -class Tag(PageElement, Entities): +class Tag(PageElement, EntitySubstitution): """Represents a found HTML tag with its attributes and contents.""" @@ -556,7 +536,7 @@ class Tag(PageElement, Entities): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag. - NOTE: right now this will return false if two tags have the + XXX: right now this will return false if two tags have the same attributes in a different order. 
Should this be fixed?""" if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False @@ -572,16 +552,7 @@ class Tag(PageElement, Entities): def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" - return self.decode(eventualEncoding=encoding) - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + return self.decode(eventual_encoding=encoding) def __unicode__(self): return self.decode() @@ -590,56 +561,29 @@ class Tag(PageElement, Entities): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - return self.decode(prettyPrint, indentLevel, encoding).encode(encoding) + pretty_print=False, indent_level=0): + return self.decode(pretty_print, indent_level, encoding).encode(encoding) - def decode(self, prettyPrint=False, indentLevel=0, - eventualEncoding=DEFAULT_OUTPUT_ENCODING): + def decode(self, pretty_print=False, indent_level=0, + eventual_encoding=DEFAULT_OUTPUT_ENCODING): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding.""" attrs = [] if self.attrs: for key, val in self.attrs: - fmt = '%s="%s"' - if isString(val): - if (self.contains_substitutions - and eventualEncoding is not None - and '%SOUP-ENCODING%' in val): - val = self.substituteEncoding(val, eventualEncoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. 
- # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) if val is None: - # Handle boolean attributes. decoded = key else: - decoded = fmt % (key, val) + if not isinstance(val, basestring): + val = str(val) + if (self.contains_substitutions + and eventual_encoding is not None + and '%SOUP-ENCODING%' in val): + val = self.substituteEncoding(val, eventual_encoding) + + # XXX: Set destination_is_xml based on... something! 
+ decoded = key + '=' + self.substitute_xml(val, True, False) attrs.append(decoded) close = '' closeTag = '' @@ -649,12 +593,12 @@ class Tag(PageElement, Entities): closeTag = '</%s>' % self.name indentTag, indentContents = 0, 0 - if prettyPrint: - indentTag = indentLevel + if pretty_print: + indentTag = indent_level space = (' ' * (indentTag-1)) indentContents = indentTag + 1 - contents = self.decodeContents(prettyPrint, indentContents, - eventualEncoding) + contents = self.decodeContents(pretty_print, indentContents, + eventual_encoding) if self.hidden: s = contents else: @@ -662,18 +606,18 @@ class Tag(PageElement, Entities): attributeString = '' if attrs: attributeString = ' ' + ' '.join(attrs) - if prettyPrint: + if pretty_print: s.append(space) s.append('<%s%s%s>' % (self.name, attributeString, close)) - if prettyPrint: + if pretty_print: s.append("\n") s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": + if pretty_print and contents and contents[-1] != "\n": s.append("\n") - if prettyPrint and closeTag: + if pretty_print and closeTag: s.append(space) s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: + if pretty_print and closeTag and self.nextSibling: s.append("\n") s = ''.join(s) return s @@ -692,27 +636,27 @@ class Tag(PageElement, Entities): return self.encode(encoding, True) def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + pretty_print=False, indent_level=0): + return self.decodeContents(pretty_print, indent_level).encode(encoding) - def decodeContents(self, prettyPrint=False, indentLevel=0, - eventualEncoding=DEFAULT_OUTPUT_ENCODING): + def decodeContents(self, pretty_print=False, indent_level=0, + eventual_encoding=DEFAULT_OUTPUT_ENCODING): """Renders the contents of this tag as a string in the given encoding. 
If encoding is None, returns a Unicode string..""" s=[] for c in self: text = None if isinstance(c, NavigableString): - text = c.decodeGivenEventualEncoding(eventualEncoding) + text = c.decodeGivenEventualEncoding(eventual_encoding) elif isinstance(c, Tag): - s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) - if text and prettyPrint: + s.append(c.decode(pretty_print, indent_level, eventual_encoding)) + if text and pretty_print: text = text.strip() if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) + if pretty_print: + s.append(" " * (indent_level-1)) s.append(text) - if prettyPrint: + if pretty_print: s.append("\n") return ''.join(s) @@ -790,7 +734,7 @@ class SoupStrainer(object): def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name - if isString(attrs): + if isinstance(attrs, basestring): kwargs['class'] = attrs attrs = None if kwargs: @@ -863,7 +807,7 @@ class SoupStrainer(object): found = self.searchTag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isString(markup): + isinstance(markup, basestring): if self._matches(markup, self.text): found = markup else: @@ -883,18 +827,19 @@ class SoupStrainer(object): #other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name - if markup is not None and not isString(markup): + if markup is not None and not isinstance(markup, basestring): markup = unicode(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. 
result = markup and matchAgainst.search(markup) elif (isList(matchAgainst) - and (markup is not None or not isString(matchAgainst))): + and (markup is not None + or not isinstance(matchAgainst, basestring))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) - elif matchAgainst and isString(markup): + elif matchAgainst and isinstance(markup, basestring): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) else: diff --git a/beautifulsoup/util.py b/beautifulsoup/util.py index 693a7e2..5978865 100644 --- a/beautifulsoup/util.py +++ b/beautifulsoup/util.py @@ -9,17 +9,9 @@ except NameError: def isList(l): """Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.""" - return ((hasattr(l, '__iter__') and not isString(l)) + return ((hasattr(l, '__iter__') and not isinstance(l, basestring)) or (type(l) in (types.ListType, types.TupleType))) -def isString(s): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is stringlike.""" - try: - return isinstance(s, unicode) or isinstance(s, basestring) - except NameError: - return isinstance(s, str) - def buildSet(args=None): """Turns a list or a string into a set.""" if isinstance(args, str): diff --git a/tests/test_soup.py b/tests/test_soup.py index bb2262a..eaedd94 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -3,7 +3,7 @@ import unittest from beautifulsoup.element import SoupStrainer -from beautifulsoup.dammit import UnicodeDammit +from beautifulsoup.dammit import EntitySubstitution, UnicodeDammit from beautifulsoup.testing import SoupTest @@ -16,6 +16,71 @@ class TestSelectiveParsing(SoupTest): self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution() + + def 
test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEquals(self.sub.substitute_html(s), + u"foo&forall;\N{SNOWMAN}&otilde;bar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. + quotes = "\x91\x92foo\x93\x94" + dammit = UnicodeDammit(quotes) + self.assertEquals(self.sub.substitute_html(dammit.markup), + "&lsquo;&rsquo;foo&ldquo;&rdquo;") + + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, False), s) + + def test_xml_attribute_quoting_normally_uses_double_quotes(self): + self.assertEquals(self.sub.substitute_xml("Welcome", True), + '"Welcome"') + self.assertEquals(self.sub.substitute_xml("Bob's Bar", True), + '"Bob\'s Bar"') + + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): + s = 'Welcome to "my bar"' + self.assertEquals(self.sub.substitute_xml(s, True), + "'Welcome to \"my bar\"'") + + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): + s = 'Welcome to "Bob\'s Bar"' + # This one is going into an HTML document. + self.assertEquals( + self.sub.substitute_xml(s, True), + "'Welcome to \"Bob&squot;s Bar\"'") + + # This one is going into an XML document. 
+ self.assertEquals( + self.sub.substitute_xml(s, True, destination_is_xml=True), + "'Welcome to \"Bob&apos;s Bar\"'") + + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): + quoted = 'Welcome to "Bob\'s Bar"' + self.assertEquals(self.sub.substitute_xml(quoted), quoted) + + def test_xml_quoting_handles_angle_brackets(self): + self.assertEquals( + self.sub.substitute_xml("foo<bar>"), + "foo&lt;bar&gt;") + + def test_xml_quoting_handles_ampersands(self): + self.assertEquals(self.sub.substitute_xml("AT&T"), "AT&amp;T") + + def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self): + self.assertEquals( + self.sub.substitute_xml("&Aacute;T&T"), + "&Aacute;T&amp;T") + + class TestUnicodeDammit(unittest.TestCase): """Standalone tests of Unicode, Dammit.""" |