diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:32:03 -0400 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2013-06-03 09:32:03 -0400 |
commit | 88699026b6f94d6d7a831e849b01b8f3582d57a4 (patch) | |
tree | 3aa59a7928bf5477d74b15e165c14a3d4f755e8f | |
parent | d284ed9a3b16d7259303171934ade247186eb24f (diff) |
Inlined some commonly called code to save a function call.
-rw-r--r-- | bs4/__init__.py | 36 | ||||
-rw-r--r-- | bs4/dammit.py | 8 | ||||
-rw-r--r-- | bs4/diagnose.py | 4 |
3 files changed, 26 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 61de574..50bc67c 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -244,30 +244,34 @@ class BeautifulSoup(Tag): if tag.name in self.builder.preserve_whitespace_tags: self.preserve_whitespace_tag_stack.append(tag) - def _contains_only_ascii_spaces(self, s): - """Returns true if the given string contains nothing other than ASCII spaces. - The empty string meets this criteria. - """ - for i in s: - if i not in self.ASCII_SPACES: - return False - return True - def endData(self, containerClass=NavigableString): if self.current_data: current_data = u''.join(self.current_data) - if (self._contains_only_ascii_spaces(current_data) and - not self.preserve_whitespace_tag_stack): - # Time to strip the whitespace. - if '\n' in current_data: - current_data = '\n' - else: - current_data = ' ' + + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. self.current_data = [] + + # Should we add this string to the tree at all? if self.parse_only and len(self.tagStack) <= 1 and \ (not self.parse_only.text or \ not self.parse_only.search(current_data)): return + o = containerClass(current_data) self.object_was_parsed(o) diff --git a/bs4/dammit.py b/bs4/dammit.py index a6b8663..c859066 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -295,15 +295,15 @@ class EncodingDetector: beginning of the document. """ if search_entire_document: - xml_endpos = html_endpos = -1 + xml_endpos = html_endpos = len(markup) else: - xml_endpos = 1025 + xml_endpos = 1024 html_endpos = max(2048, int(len(markup) * 0.05)) declared_encoding = None - declared_encoding_match = xml_encoding_re.search(markup, xml_endpos) + declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) if not declared_encoding_match and is_html: - declared_encoding_match = html_meta_re.search(markup, html_endpos) + declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( 'ascii') diff --git a/bs4/diagnose.py b/bs4/diagnose.py index c5a0c06..a2b405b 100644 --- a/bs4/diagnose.py +++ b/bs4/diagnose.py @@ -192,8 +192,8 @@ def profile(num_elements=100000, parser="lxml"): stats.strip_dirs() cumulative = stats.sort_stats("cumulative") total = stats.sort_stats("time") - cumulative.print_stats(50) + total.print_stats(50) if __name__ == '__main__': #diagnose(sys.stdin.read()) - profile() + profile(parser="lxml") |