From d7056f49c8bb3a448cec2f1a6f2de55e93c8e8d6 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 21:26:15 -0500 Subject: First stab at HTML entity replacement. --- beautifulsoup/element.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 23f8c33..f3a59d4 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -561,11 +561,14 @@ class Tag(PageElement, EntitySubstitution): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decode(pretty_print, indent_level, encoding).encode(encoding) + pretty_print=False, indent_level=0, + replace_with_html_entities=False): + return self.decode(pretty_print, indent_level, encoding, + replace_with_html_entities).encode(encoding) def decode(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + replace_with_html_entities=False): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding.""" @@ -597,7 +600,8 @@ class Tag(PageElement, EntitySubstitution): space = (' ' * (indentTag-1)) indentContents = indentTag + 1 contents = self.decodeContents(pretty_print, indentContents, - eventual_encoding) + eventual_encoding, + replace_with_html_entities) if self.hidden: s = contents else: @@ -635,11 +639,15 @@ class Tag(PageElement, EntitySubstitution): return self.encode(encoding, True) def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0): - return self.decodeContents(pretty_print, indent_level).encode(encoding) + pretty_print=False, indent_level=0, + replace_With_html_entities=False): + return self.decodeContents( + pretty_print, indent_level, replace_with_html_entities).encode( + encoding) def decodeContents(self, pretty_print=False, indent_level=0, - eventual_encoding=DEFAULT_OUTPUT_ENCODING): + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + replace_with_html_entities=False): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" s=[] @@ -648,10 +656,13 @@ class Tag(PageElement, EntitySubstitution): if isinstance(c, NavigableString): text = c.decodeGivenEventualEncoding(eventual_encoding) elif isinstance(c, Tag): - s.append(c.decode(pretty_print, indent_level, eventual_encoding)) + s.append(c.decode(pretty_print, indent_level, eventual_encoding, + replace_with_html_entities)) if text and pretty_print: text = text.strip() if text: + if replace_with_html_entities: + text = self.substitute_html(text) if pretty_print: s.append(" " * (indent_level-1)) s.append(text) -- cgit v1.2.3 From 247785aa53358fa64e9bd5f799c4c9a1609489f0 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 22:54:04 -0500 Subject: Renamed replace_with_html_entities to substitute_html_entities. --- beautifulsoup/element.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index f3a59d4..bbecdbd 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -562,13 +562,13 @@ class Tag(PageElement, EntitySubstitution): def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, pretty_print=False, indent_level=0, - replace_with_html_entities=False): + substitute_html_entities=False): return self.decode(pretty_print, indent_level, encoding, - replace_with_html_entities).encode(encoding) + substitute_html_entities).encode(encoding) def decode(self, pretty_print=False, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - replace_with_html_entities=False): + substitute_html_entities=False): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding.""" @@ -601,7 +601,7 @@ class Tag(PageElement, EntitySubstitution): indentContents = indentTag + 1 contents = self.decodeContents(pretty_print, indentContents, eventual_encoding, - replace_with_html_entities) + substitute_html_entities) if self.hidden: s = contents else: @@ -642,12 +642,12 @@ class Tag(PageElement, EntitySubstitution): pretty_print=False, indent_level=0, replace_With_html_entities=False): return self.decodeContents( - pretty_print, indent_level, replace_with_html_entities).encode( + pretty_print, indent_level, substitute_html_entities).encode( encoding) def decodeContents(self, pretty_print=False, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - replace_with_html_entities=False): + substitute_html_entities=False): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" s=[] @@ -657,11 +657,11 @@ class Tag(PageElement, EntitySubstitution): text = c.decodeGivenEventualEncoding(eventual_encoding) elif isinstance(c, Tag): s.append(c.decode(pretty_print, indent_level, eventual_encoding, - replace_with_html_entities)) + substitute_html_entities)) if text and pretty_print: text = text.strip() if text: - if replace_with_html_entities: + if substitute_html_entities: text = self.substitute_html(text) if pretty_print: s.append(" " * (indent_level-1)) -- cgit v1.2.3 From cb369db2e4a16816ae78ff6eff7d8d3393141569 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 23:05:10 -0500 Subject: Refactored the code that makes a string output-ready. --- beautifulsoup/element.py | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index bbecdbd..3515d50 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -11,7 +11,7 @@ from util import isList DEFAULT_OUTPUT_ENCODING = "utf-8" -class PageElement(object): +class PageElement(EntitySubstitution): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -334,6 +334,9 @@ class PageElement(object): class NavigableString(unicode, PageElement): + PREFIX = '' + SUFFIX = '' + def __new__(cls, value): """Create a new NavigableString. @@ -358,29 +361,35 @@ class NavigableString(unicode, PageElement): else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - def decodeGivenEventualEncoding(self, eventual_encoding): - return self + def output_ready(self, substitute_html_entities=False): + if substitute_html_entities: + output = self.substitute_html(self) + else: + output = self + return self.PREFIX + output + self.SUFFIX + class CData(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'' + PREFIX = u'' + class ProcessingInstruction(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - output = self - if u'%SOUP-ENCODING%' in output: - output = self.substituteEncoding(output, eventual_encoding) - return u'' + PREFIX = u'' + class Comment(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'' + + PREFIX = u'' class Declaration(NavigableString): - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'' + PREFIX = u'' + class Doctype(NavigableString): @@ -394,10 +403,11 @@ class Doctype(NavigableString): return Doctype(value) - def decodeGivenEventualEncoding(self, eventual_encoding): - return u'' + PREFIX = u'' + -class Tag(PageElement, EntitySubstitution): +class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" @@ -654,15 +664,13 @@ class Tag(PageElement, EntitySubstitution): for c in self: text = None if isinstance(c, NavigableString): - text = c.decodeGivenEventualEncoding(eventual_encoding) + text = c.output_ready(substitute_html_entities) elif isinstance(c, Tag): s.append(c.decode(pretty_print, indent_level, eventual_encoding, substitute_html_entities)) if text and pretty_print: text = text.strip() if text: - if substitute_html_entities: - text = self.substitute_html(text) if pretty_print: s.append(" " * (indent_level-1)) s.append(text) -- cgit v1.2.3 From a5f2ee72f16d0b6755ab84ddf2eff8e3d412e755 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 23:11:54 -0500 Subject: Cleaned up decodeContents, and removed encodeContents, which isn't used. --- beautifulsoup/element.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 3515d50..e6e7adb 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -609,7 +609,7 @@ class Tag(PageElement): indentTag = indent_level space = (' ' * (indentTag-1)) indentContents = indentTag + 1 - contents = self.decodeContents(pretty_print, indentContents, + contents = self.decode_contents(pretty_print, indentContents, eventual_encoding, substitute_html_entities) if self.hidden: @@ -648,18 +648,18 @@ class Tag(PageElement): def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) - def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0, - replace_With_html_entities=False): - return self.decodeContents( - pretty_print, indent_level, substitute_html_entities).encode( - encoding) - - def decodeContents(self, pretty_print=False, indent_level=0, + def decode_contents(self, pretty_print=False, indent_level=0, eventual_encoding=DEFAULT_OUTPUT_ENCODING, substitute_html_entities=False): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" + """Renders the contents of this tag as a Unicode string. + + :param eventual_encoding: The document is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ s=[] for c in self: text = None -- cgit v1.2.3 From 5082ca97607846f3be18859f1b803ac15ef77083 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 23:19:10 -0500 Subject: Removed the redundant pretty_print argument except for the top level. --- beautifulsoup/element.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index e6e7adb..519a9da 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -571,12 +571,11 @@ class Tag(PageElement): return self.encode() def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - pretty_print=False, indent_level=0, - substitute_html_entities=False): - return self.decode(pretty_print, indent_level, encoding, + indent_level=None, substitute_html_entities=False): + return self.decode(indent_level, encoding, substitute_html_entities).encode(encoding) - def decode(self, pretty_print=False, indent_level=0, + def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, substitute_html_entities=False): """Returns a string or Unicode representation of this tag and @@ -604,15 +603,18 @@ class Tag(PageElement): else: closeTag = '' % self.name - indentTag, indentContents = 0, 0 + pretty_print = (indent_level is not None) if pretty_print: - indentTag = indent_level - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.decode_contents(pretty_print, indentContents, - eventual_encoding, - substitute_html_entities) + space = (' ' * (indent_level-1)) + indent_contents = indent_level + 1 + else: + space = '' + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, substitute_html_entities) + if self.hidden: + # This is the 'document root' object. s = contents else: s = [] @@ -648,7 +650,7 @@ class Tag(PageElement): def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.encode(encoding, True) - def decode_contents(self, pretty_print=False, indent_level=0, + def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, substitute_html_entities=False): """Renders the contents of this tag as a Unicode string. @@ -660,15 +662,16 @@ class Tag(PageElement): document contains a tag that mentions the document's encoding. """ + pretty_print = (indent_level is not None) s=[] for c in self: text = None if isinstance(c, NavigableString): text = c.output_ready(substitute_html_entities) elif isinstance(c, Tag): - s.append(c.decode(pretty_print, indent_level, eventual_encoding, + s.append(c.decode(indent_level, eventual_encoding, substitute_html_entities)) - if text and pretty_print: + if text and indent_level: text = text.strip() if text: if pretty_print: -- cgit v1.2.3 From 1330639ff7ae099ce80e77a8b6be6a0d75b60f04 Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 23:22:42 -0500 Subject: Minor cleanup and adding docstrings. --- beautifulsoup/element.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index 519a9da..d37124f 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -562,7 +562,7 @@ class Tag(PageElement): def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" - return self.decode(eventual_encoding=encoding) + return self.encode(encoding) def __unicode__(self): return self.decode() @@ -578,9 +578,15 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, substitute_html_entities=False): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding.""" + """Returns a Unicode representation of this tag and its contents. + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ attrs = [] if self.attrs: for key, val in self.attrs: @@ -655,7 +661,7 @@ class Tag(PageElement): substitute_html_entities=False): """Renders the contents of this tag as a Unicode string. - :param eventual_encoding: The document is destined to be + :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the -- cgit v1.2.3 From 8a6e1b5e15368c9dd66b6b407b7328c2bd0360ad Mon Sep 17 00:00:00 2001 From: Leonard Richardson Date: Sat, 26 Feb 2011 23:39:06 -0500 Subject: The attribute list comes in as a dictionary, so stop turning it into a list for no reason. Saves code and a little time. Sort outgoing attributes so that the tests will run consistently. --- beautifulsoup/element.py | 51 ++++++++++-------------------------------------- 1 file changed, 10 insertions(+), 41 deletions(-) (limited to 'beautifulsoup/element.py') diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index d37124f..6af27a8 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -420,19 +420,14 @@ class Tag(PageElement): self.parserClass = parser.__class__ self.name = name if attrs == None: - attrs = [] - if isinstance(attrs, types.DictType): - self.attrMap = attrs + attrs = {} + else: + attrs = dict(attrs) self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False - if isinstance(attrs, types.DictType): - self.attrs = [kv for kv in attrs.items()] - else: - self.attrs = list(attrs) - # Set up any substitutions, such as the charset in a META tag. self.contains_substitutions = builder.set_up_substitutions(self) @@ -478,15 +473,15 @@ class Tag(PageElement): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" - return self._getAttrMap().get(key, default) + return self.attrs.get(key, default) def has_key(self, key): - return self._getAttrMap().has_key(key) + return self.attrs.has_key(key) def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" - return self._getAttrMap()[key] + return self.attrs[key] def __iter__(self): "Iterating over a tag iterates over its contents." @@ -506,27 +501,12 @@ class Tag(PageElement): def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value + self.attrs[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] + if self.attrs.has_key(key): + del self.attrs[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its @@ -589,7 +569,7 @@ class Tag(PageElement): """ attrs = [] if self.attrs: - for key, val in self.attrs: + for key, val in sorted(self.attrs.items()): if val is None: decoded = key else: @@ -718,17 +698,6 @@ class Tag(PageElement): findAll = find_all # BS3 findChildren = find_all # BS2 - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - #Generator methods @property def children(self): -- cgit v1.2.3