summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:32:03 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2013-06-03 09:32:03 -0400
commit: 88699026b6f94d6d7a831e849b01b8f3582d57a4 (patch)
tree: 3aa59a7928bf5477d74b15e165c14a3d4f755e8f
parent: d284ed9a3b16d7259303171934ade247186eb24f (diff)
Inlined some commonly called code to save a function call.
-rw-r--r-- bs4/__init__.py | 36
-rw-r--r-- bs4/dammit.py | 8
-rw-r--r-- bs4/diagnose.py | 4
3 files changed, 26 insertions, 22 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 61de574..50bc67c 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -244,30 +244,34 @@ class BeautifulSoup(Tag):
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
- def _contains_only_ascii_spaces(self, s):
- """Returns true if the given string contains nothing other than ASCII spaces.
- The empty string meets this criteria.
- """
- for i in s:
- if i not in self.ASCII_SPACES:
- return False
- return True
-
def endData(self, containerClass=NavigableString):
if self.current_data:
current_data = u''.join(self.current_data)
- if (self._contains_only_ascii_spaces(current_data) and
- not self.preserve_whitespace_tag_stack):
- # Time to strip the whitespace.
- if '\n' in current_data:
- current_data = '\n'
- else:
- current_data = ' '
+
+ # If whitespace is not preserved, and this string contains
+ # nothing but ASCII spaces, replace it with a single space
+ # or newline.
+ if not self.preserve_whitespace_tag_stack:
+ strippable = True
+ for i in current_data:
+ if i not in self.ASCII_SPACES:
+ strippable = False
+ break
+ if strippable:
+ if '\n' in current_data:
+ current_data = '\n'
+ else:
+ current_data = ' '
+
+ # Reset the data collector.
self.current_data = []
+
+ # Should we add this string to the tree at all?
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(current_data)):
return
+
o = containerClass(current_data)
self.object_was_parsed(o)
diff --git a/bs4/dammit.py b/bs4/dammit.py
index a6b8663..c859066 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -295,15 +295,15 @@ class EncodingDetector:
beginning of the document.
"""
if search_entire_document:
- xml_endpos = html_endpos = -1
+ xml_endpos = html_endpos = len(markup)
else:
- xml_endpos = 1025
+ xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
declared_encoding = None
- declared_encoding_match = xml_encoding_re.search(markup, xml_endpos)
+ declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
- declared_encoding_match = html_meta_re.search(markup, html_endpos)
+ declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii')
diff --git a/bs4/diagnose.py b/bs4/diagnose.py
index c5a0c06..a2b405b 100644
--- a/bs4/diagnose.py
+++ b/bs4/diagnose.py
@@ -192,8 +192,8 @@ def profile(num_elements=100000, parser="lxml"):
stats.strip_dirs()
cumulative = stats.sort_stats("cumulative")
total = stats.sort_stats("time")
- cumulative.print_stats(50)
+ total.print_stats(50)
if __name__ == '__main__':
#diagnose(sys.stdin.read())
- profile()
+ profile(parser="lxml")