diff options
-rw-r--r-- | bs4/__init__.py | 17 |
1 files changed, 11 insertions, 6 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 4e268ef..e24d914 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -75,11 +75,7 @@ class BeautifulSoup(Tag):
     # want, look for one with these features.
     DEFAULT_BUILDER_FEATURES = ['html', 'fast']
 
-    # Used when determining whether a text node is all whitespace and
-    # can be replaced with a single space. A text node that contains
-    # fancy Unicode spaces (usually non-breaking) should be left
-    # alone.
-    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
+    ASCII_SPACES = '\x20\x09\x0a\x0c\x0d'
 
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, **kwargs):
@@ -243,10 +239,19 @@ class BeautifulSoup(Tag):
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]
 
+    def _contains_only_ascii_spaces(self, s):
+        """Returns true if the given string contains nothing other than ASCII spaces.
+        The empty string meets this criteria.
+        """
+        for i in s:
+            if i not in self.ASCII_SPACES:
+                return False
+        return True
+
     def endData(self, containerClass=NavigableString):
         if self.current_data:
             current_data = u''.join(self.current_data)
-            if (current_data.translate(self.STRIP_ASCII_SPACES) == '' and
+            if (self._contains_only_ascii_spaces(current_data) and
                 not set([tag.name for tag in self.tagStack]).intersection(
                     self.builder.preserve_whitespace_tags)):
                 if '\n' in current_data: